From 16f4096e47a61b6fa58edf27ffcb188186c02a86 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 28 Aug 2020 10:00:42 -0700 Subject: [PATCH 01/12] More export developments. (#1081) * Refactoring of _prepare_for_export Signed-off-by: Boris Fomitchev * Added export_utils from Meetkai Signed-off-by: Boris Fomitchev * Fixed copyright header Signed-off-by: Boris Fomitchev * Adding export to Waveglow Signed-off-by: Boris Fomitchev * Fixing megatron test cuda coherence Signed-off-by: Boris Fomitchev * Code formatting Signed-off-by: Boris Fomitchev * extra exportable options Signed-off-by: Boris Fomitchev --- .../nlp/modules/common/bert_module.py | 4 +- .../modules/common/megatron/megatron_bert.py | 2 + .../modules/common/megatron/megatron_utils.py | 1 + nemo/collections/tts/modules/__init__.py | 8 ++-- nemo/collections/tts/modules/waveglow.py | 21 ++++++-- nemo/core/classes/exportable.py | 10 ++-- nemo/utils/export_utils.py | 6 ++- tests/collections/nlp/test_megatron.py | 8 +++- tests/collections/tts/test_waveglow.py | 48 +++++++++++++++++++ 9 files changed, 91 insertions(+), 17 deletions(-) create mode 100644 tests/collections/tts/test_waveglow.py diff --git a/nemo/collections/nlp/modules/common/bert_module.py b/nemo/collections/nlp/modules/common/bert_module.py index a02fdf9b0453..9e766a09bcbd 100644 --- a/nemo/collections/nlp/modules/common/bert_module.py +++ b/nemo/collections/nlp/modules/common/bert_module.py @@ -93,6 +93,6 @@ def input_example(self): A tuple of input examples. """ sample = next(self.parameters()) - input_ids = torch.randint(low=0, high=16, size=(2, 16)).to(sample.device) - attention_mask = torch.randint(low=0, high=1, size=(2, 16)).to(sample.device) + input_ids = torch.randint(low=0, high=2048, size=(2, 16), device=sample.device) + attention_mask = torch.randint(low=0, high=1, size=(2, 16), device=sample.device) return tuple([input_ids, attention_mask, attention_mask]) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_bert.py b/nemo/collections/nlp/modules/common/megatron/megatron_bert.py index 1fbfe7501472..b665a5ae0f59 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_bert.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_bert.py @@ -54,6 +54,8 @@ def __init__(self, model_name, config, vocab_file): config['lazy_mpu_init'] = True + config['onnx_safe'] = True + # Initialize part of Megatron global state that is needed for its constructor. 
# We set 'lazy_mpu_init' flag on to make Megatron do only the initialization that does not depend # on ddp be initialized yet (and we don't want Megatron to initialize DDP itself either) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py index 93e4d15b2d64..f95530f00f95 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py @@ -100,6 +100,7 @@ def get_megatron_lm_model( checkpoint_file = get_megatron_checkpoint(pretrained_model_name) vocab = get_megatron_vocab_file(pretrained_model_name) + model = MegatronBertEncoder(model_name=pretrained_model_name, config=config, vocab_file=vocab) return model, checkpoint_file diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index fd74a50acda0..4c1f29c4565a 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import nemo.collections.tts.modules.glow_tts -import nemo.collections.tts.modules.tacotron2 -import nemo.collections.tts.modules.waveglow +from nemo.collections.tts.modules.glow_tts import GlowTTSModule +from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder +from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder +from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet +from nemo.collections.tts.modules.waveglow import WaveGlowModule diff --git a/nemo/collections/tts/modules/waveglow.py b/nemo/collections/tts/modules/waveglow.py index 40866ff0a2fd..22904d1b5901 100644 --- a/nemo/collections/tts/modules/waveglow.py +++ b/nemo/collections/tts/modules/waveglow.py @@ -16,7 +16,7 @@ import torch from nemo.collections.tts.modules.submodules import Invertible1x1Conv, WaveNet -from nemo.core.classes import NeuralModule, typecheck +from nemo.core.classes import Exportable, NeuralModule, typecheck from nemo.core.neural_types.elements import ( AudioSignal, IntType, @@ -37,7 +37,7 @@ class OperationMode(Enum): @experimental # TODO: Implement save_to() and restore_from() -class WaveGlowModule(NeuralModule): +class WaveGlowModule(NeuralModule, Exportable): def __init__( self, n_mel_channels: int, @@ -65,6 +65,7 @@ def __init__( super().__init__() self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, n_mel_channels, 1024, stride=256) + self.n_mel_channels = n_mel_channels assert n_group % 2 == 0 self.n_flows = n_flows self.n_group = n_group @@ -74,7 +75,7 @@ def __init__( self.convinv = torch.nn.ModuleList() self.mode = OperationMode.infer - n_half = int(n_group / 2) + n_half = n_group // 2 # Set up layers with the right sizes based on how many dimensions # have been output already @@ -96,7 +97,7 @@ def __init__( self.n_remaining_channels = n_remaining_channels @typecheck() - def forward(self, *, spect, audio=None, run_inverse=True): + def forward(self, spect, audio=None, run_inverse=True): """ TODO """ if self.training and self.mode != OperationMode.training: @@ -141,6 +142,16 @@ def output_types(self): "audio": NeuralType(('B', 'T'), AudioSignal()), } + def input_example(self): + """ + Generates input examples for tracing etc. + Returns: + A tuple of input examples. 
+ """ + par = next(self.parameters()) + mel = torch.randn((1, self.n_mel_channels, 96), device=par.device, dtype=par.dtype) + return tuple([mel]) + def audio_to_normal_dist(self, *, spect: torch.Tensor, audio: torch.Tensor) -> (torch.Tensor, list, list): # Upsample spectrogram to size of audio spect = self.upsample(spect) @@ -195,7 +206,7 @@ def norm_dist_to_audio(self, *, spect, sigma: float = 1.0): ) for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1) / 2) + n_half = audio.size(1) // 2 audio_0 = audio[:, :n_half, :] audio_1 = audio[:, n_half:, :] diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index 9424a889aaad..aed0071c92c8 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -54,6 +54,8 @@ def export( onnx_opset_version: int = 12, try_script: bool = False, set_eval: bool = True, + check_trace: bool = True, + use_dynamic_axes: bool = True, ): try: # Disable typechecks @@ -92,13 +94,15 @@ def export( if _name in self.disabled_deployment_input_names: input_names.remove(_name) continue - dynamic_axes = {**dynamic_axes, **self._extract_dynamic_axes(_name, ntype)} + if use_dynamic_axes: + dynamic_axes = {**dynamic_axes, **self._extract_dynamic_axes(_name, ntype)} # for output_ports for _name, ntype in self.output_types.items(): if _name in self.disabled_deployment_output_names: output_names.remove(_name) continue - dynamic_axes = {**dynamic_axes, **self._extract_dynamic_axes(_name, ntype)} + if use_dynamic_axes: + dynamic_axes = {**dynamic_axes, **self._extract_dynamic_axes(_name, ntype)} if len(dynamic_axes) == 0: dynamic_axes = None @@ -117,7 +121,7 @@ def export( _in_example = tuple(_in_example.values()) if jitted_model is None: - jitted_model = torch.jit.trace(self, _in_example) + jitted_model = torch.jit.trace(self, _in_example, check_trace=check_trace) if format == ExportFormat.TORCHSCRIPT: jitted_model.save(output) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index b4bc9bda8550..3f7da4a8d7fd 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -34,8 +34,10 @@ def replace_FusedLayerNorm(n: nn.Module) -> Optional[nn.BatchNorm2d]: """ if not apex_available or not isinstance(n, FusedLayerNorm): return None - # FusedLayerNorm could have only resided on CUDA - mod = nn.LayerNorm(n.normalized_shape, eps=n.eps, elementwise_affine=n.elementwise_affine,).cuda() + + dev = next(n.parameters()).device + mod = nn.LayerNorm(n.normalized_shape, eps=n.eps, elementwise_affine=n.elementwise_affine,).to(dev) + n_state = n.state_dict() mod.load_state_dict(n_state) return mod diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py index 2efacc6203b4..c018c72b9514 100644 --- a/tests/collections/nlp/test_megatron.py +++ b/tests/collections/nlp/test_megatron.py @@ -25,6 +25,7 @@ import onnx import pytest +import torch import nemo.collections.nlp as nemo_nlp @@ -40,10 +41,13 @@ def test_list_pretrained_models(self): @pytest.mark.unit def test_get_pretrained_bert_345m_uncased_model(self): model_name = "megatron-bert-345m-uncased" - model = nemo_nlp.modules.get_pretrained_lm_model(model_name).cuda() + model = nemo_nlp.modules.get_pretrained_lm_model(model_name) + if torch.cuda.is_available(): + model = model.cuda() + assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - if False: # apex_available: + if False: # apex_available: model = apex.amp.initialize(model, opt_level="O2") with tempfile.TemporaryDirectory() as tmpdir: # Generate 
filename in the temporary directory. diff --git a/tests/collections/tts/test_waveglow.py b/tests/collections/tts/test_waveglow.py new file mode 100644 index 000000000000..dba8b71e8751 --- /dev/null +++ b/tests/collections/tts/test_waveglow.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +from unittest import TestCase + +import pytest +from omegaconf import DictConfig + +from nemo.collections.tts.models import WaveGlowModel +from nemo.collections.tts.modules import WaveGlowModule + +wcfg = DictConfig( + { + "n_flows": 12, + "n_group": 8, + "n_mel_channels": 80, + "n_early_every": 4, + "n_early_size": 2, + "n_wn_channels": 512, + "n_wn_layers": 8, + "wn_kernel_size": 3, + } +) + + +class TestWaveGlow: + @pytest.mark.run_only_on('GPU') + @pytest.mark.unit + def test_export_to_onnx(self): + model = WaveGlowModule(**wcfg).cuda().half() + with tempfile.TemporaryDirectory() as tmpdir: + # Generate filename in the temporary directory. + tmp_file_name = os.path.join("waveglow.onnx") + # Test export. + model.export(tmp_file_name, check_trace=False) From be41d5cebbc1fa51313eb78cf3bcbefd684026a4 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 28 Aug 2020 15:34:18 -0700 Subject: [PATCH 02/12] reduce ci time (#1087) Signed-off-by: Yang Zhang --- Jenkinsfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 2b2ded1794fc..87eb554663be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -247,6 +247,7 @@ pipeline { model.train_ds.batch_size=8 \ model.validation_ds.batch_size=8 \ trainer.max_epochs=1 \ + +trainer.max_steps=1 \ model.language_model.pretrained_model_name=bert-base-uncased \ model.dataset.version_2_with_negative=false \ trainer.precision=16 \ @@ -267,6 +268,7 @@ pipeline { model.train_ds.batch_size=8 \ model.validation_ds.batch_size=8 \ trainer.max_epochs=1 \ + +trainer.max_steps=1 \ model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ model.language_model.pretrained_model_name=bert-base-uncased \ model.dataset.version_2_with_negative=true \ @@ -323,6 +325,7 @@ pipeline { model.validation_ds.batch_size=4 \ trainer.distributed_backend=ddp \ trainer.max_epochs=1 \ + +trainer.max_steps=1 \ model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ model.dataset.version_2_with_negative=true \ @@ -356,6 +359,7 @@ pipeline { model.train_ds.batch_size=8 \ model.validation_ds.batch_size=8 \ trainer.max_epochs=1 \ + +trainer.max_steps=1 \ model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ model.dataset.do_lower_case=false \ model.language_model.pretrained_model_name=roberta-base \ From 26941ea51e85a48f76ffcdd589f887eae4751995 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 28 Aug 2020 15:50:11 -0700 Subject: [PATCH 03/12] Add MatchboxNet support for Subset Classification task (#1085) * Finalize configs 
Signed-off-by: smajumdar * Patch testing and add pretrained model for subset task Signed-off-by: smajumdar * Style fix Signed-off-by: smajumdar --- examples/asr/conf/matchboxnet_3x1x64_v1.yaml | 62 ++++++++++--------- examples/asr/conf/matchboxnet_3x1x64_v2.yaml | 61 +++++++++--------- .../asr/models/classification_models.py | 8 +++ nemo/core/classes/modelPT.py | 21 +++++-- 4 files changed, 88 insertions(+), 64 deletions(-) diff --git a/examples/asr/conf/matchboxnet_3x1x64_v1.yaml b/examples/asr/conf/matchboxnet_3x1x64_v1.yaml index 08e7399c133e..13100df6f457 100644 --- a/examples/asr/conf/matchboxnet_3x1x64_v1.yaml +++ b/examples/asr/conf/matchboxnet_3x1x64_v1.yaml @@ -1,20 +1,24 @@ name: &name "MatchboxNet-3x1x64-v1" -sample_rate: &sample_rate 16000 model: - timesteps: ×teps 128 - repeat: &repeat 1 - dropout: &dropout 0.0 - kernel_size_factor: &kfactor 1.0 + sample_rate: 16000 + timesteps: 128 + repeat: 1 + dropout: 0.0 + kernel_size_factor: 1.0 - labels: &labels ['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', - 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', - 'wow', 'yes', 'zero'] + labels_full: ['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', + 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', + 'wow', 'yes', 'zero'] + + labels_subset: ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknown", "silence"] + + labels: ${model.labels_full} train_ds: manifest_filepath: ??? - sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: True augmentor: @@ -29,16 +33,16 @@ model: validation_ds: manifest_filepath: ??? 
- sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: False val_loss_idx: 0 test_ds: manifest_filepath: null - sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: False test_loss_idx: 0 @@ -67,7 +71,7 @@ model: crop_or_pad_augment: cls: nemo.collections.asr.modules.CropOrPadSpectrogramAugmentation params: - audio_length: *timesteps + audio_length: ${model.timesteps} encoder: cls: nemo.collections.asr.modules.ConvASREncoder @@ -82,57 +86,57 @@ model: kernel: [11] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: false separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [13] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [15] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [17] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 128 repeat: 1 kernel: [29] stride: [1] dilation: [2] - dropout: *dropout + dropout: ${model.dropout} residual: false separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: &enc_final_filters 128 repeat: 1 kernel: [1] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: false decoder: diff --git a/examples/asr/conf/matchboxnet_3x1x64_v2.yaml b/examples/asr/conf/matchboxnet_3x1x64_v2.yaml index aee24e7b303b..2c9706e8bafc 100644 --- a/examples/asr/conf/matchboxnet_3x1x64_v2.yaml +++ b/examples/asr/conf/matchboxnet_3x1x64_v2.yaml @@ -1,21 +1,24 @@ name: &name "MatchboxNet-3x1x64-v2" model: - sample_rate: &sample_rate 16000 + sample_rate: 16000 + timesteps: 128 + repeat: 1 + dropout: 0.0 + kernel_size_factor: 1.0 - timesteps: ×teps 128 - repeat: &repeat 1 - dropout: &dropout 0.0 - kernel_size_factor: &kfactor 1.0 + labels_full: ['visual', 'wow', 'learn', 'backward', 'dog', 'two', 'left', 'happy', 'nine', 'go', 'up', 'bed', 'stop', + 'one', 'zero', 'tree', 'seven', 'on', 'four', 'bird', 'right', 'eight', 'no', 'six', 'forward', 'house', + 'marvin', 'sheila', 'five', 'off', 'three', 'down', 'cat', 'follow', 'yes'] - labels: &labels ['visual', 'wow', 'learn', 'backward', 'dog', 'two', 'left', 'happy', 'nine', 'go', 'up', 'bed', 'stop', - 'one', 'zero', 'tree', 'seven', 'on', 'four', 'bird', 'right', 'eight', 'no', 'six', 'forward', 'house', - 'marvin', 'sheila', 'five', 'off', 'three', 'down', 'cat', 'follow', 'yes'] + labels_subset: ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknown", "silence"] + + labels: ${model.labels_full} train_ds: manifest_filepath: ??? - sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: True augmentor: @@ -30,16 +33,16 @@ model: validation_ds: manifest_filepath: ??? 
- sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: False val_loss_idx: 0 test_ds: manifest_filepath: null - sample_rate: *sample_rate - labels: *labels + sample_rate: ${model.sample_rate} + labels: ${model.labels} batch_size: 128 shuffle: False test_loss_idx: 0 @@ -68,7 +71,7 @@ model: crop_or_pad_augment: cls: nemo.collections.asr.modules.CropOrPadSpectrogramAugmentation params: - audio_length: *timesteps + audio_length: ${model.timesteps} encoder: cls: nemo.collections.asr.modules.ConvASREncoder @@ -83,57 +86,57 @@ model: kernel: [11] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: false separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [13] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [15] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 64 - repeat: *repeat + repeat: ${model.repeat} kernel: [17] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: true separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: 128 repeat: 1 kernel: [29] stride: [1] dilation: [2] - dropout: *dropout + dropout: ${model.dropout} residual: false separable: true - kernel_size_factor: *kfactor + kernel_size_factor: ${model.kernel_size_factor} - filters: &enc_final_filters 128 repeat: 1 kernel: [1] stride: [1] dilation: [1] - dropout: *dropout + dropout: ${model.dropout} residual: false decoder: diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index c92f110b9382..f586d440bb16 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -148,6 +148,14 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: "which obtains 97.29% accuracy on test set.", ) result.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="MatchboxNet-3x1x64-v2-subset-task", + location="https://nemo-public.s3.us-east-2.amazonaws.com/nemo-1.0.0alpha-tests/MatchboxNet-3x2x64-v2-subset-task.nemo", + description="MatchboxNet model trained on Google Speech Commands dataset (v2, 10+2 classes) " + "which obtains 98.4% accuracy on test set.", + ) + result.append(model) return result @property diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index aa2334edd669..eb64e5348c93 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -758,17 +758,17 @@ def prepare_test(self, trainer: 'Trainer') -> bool: # Replace ddp multi-gpu until PTL has a fix DDP_WARN = """\n\nDuring testing, it is currently advisable to construct a new Trainer " - "with single GPU and no DDP to obtain accurate results.\n" - "Following pattern should be used: \n" - "gpu = 1 if cfg.trainer.gpus != 0 else 0\n" - "trainer = Trainer(gpus=gpu)\n" - "if model.prepare_test(trainer):\n" + "with single GPU and no DDP to obtain accurate results. 
+ "Following pattern should be used: " + "gpu = 1 if cfg.trainer.gpus != 0 else 0" + "trainer = Trainer(gpus=gpu)" + "if model.prepare_test(trainer):" " trainer.test(model)\n\n""" if trainer is not None: if trainer.num_gpus > 1: logging.warning(DDP_WARN) - return True + return False # Assign trainer to the model self.set_trainer(trainer) @@ -787,6 +787,15 @@ def set_trainer(self, trainer: 'Trainer'): def num_weights(self): return sum(p.numel() for p in self.parameters() if p.requires_grad) + @property + def cfg(self): + return self._cfg + + @cfg.setter + def cfg(self, cfg): + self._cfg = cfg + self._set_hparams(cfg) + @staticmethod def __make_nemo_file_from_folder(filename, source_dir): with tarfile.open(filename, "w:gz") as tar: From d55bea7cb763477610d199db71cf6a0fc674dc34 Mon Sep 17 00:00:00 2001 From: fayejf <36722593+fayejf@users.noreply.github.com> Date: Fri, 28 Aug 2020 23:26:17 -0700 Subject: [PATCH 04/12] Port VAD tutorial 06 (#1076) * port vad train Signed-off-by: fayejf * style fix Signed-off-by: fayejf * Copy freesound scripts. Modify process vad script Signed-off-by: fayejf * update copyright Signed-off-by: fayejf * merge train scripts Signed-off-by: fayejf * update vad process for librosa==0.8.0 Signed-off-by: fayejf * update vad process for librosa==0.8.0 clean Signed-off-by: fayejf * add 06 vad tutorial Signed-off-by: fayejf * add 06 nb vad. might need update link Signed-off-by: fayejf * fix style Signed-off-by: fayejf * update metadata Signed-off-by: fayejf * fix colab cell type Signed-off-by: fayejf * modify process data scripts for tutorial to be faster running in Colab Signed-off-by: fayejf * fix script link Signed-off-by: fayejf * clean Signed-off-by: fayejf * few updates based on feedback Signed-off-by: fayejf * style fix Signed-off-by: fayejf --- examples/asr/speech_to_label.py | 2 +- scripts/process_vad_data.py | 45 +- .../asr/06_Voice_Activiy_Detection.ipynb | 1153 +++++++++++++++++ 3 files changed, 1183 insertions(+), 17 deletions(-) create mode 100644 tutorials/asr/06_Voice_Activiy_Detection.ipynb diff --git a/examples/asr/speech_to_label.py b/examples/asr/speech_to_label.py index 06a2e9a224a4..be382c282d18 100644 --- a/examples/asr/speech_to_label.py +++ b/examples/asr/speech_to_label.py @@ -56,7 +56,7 @@ --background_data_root= \ --rebalance_method=<'under' or 'over' of 'fixed'> \ --log - (Optional --generate (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --generates) + (Optional --demo (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --demo) ``` ## Train to convergence diff --git a/scripts/process_vad_data.py b/scripts/process_vad_data.py index 120a75775999..ba98113d9bb2 100644 --- a/scripts/process_vad_data.py +++ b/scripts/process_vad_data.py @@ -18,9 +18,9 @@ --out_dir= \ --speech_data_root= \ --background_data_root= \ - --rebalance_method=<'under' or 'over' of 'fixed'> \ + --rebalance_method=<'under' or 'over' or 'fixed'> \ --log - (Optional --generate (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --generates) + (Optional --demo (for demonstration in tutorial). 
If you want to use your own background noise data, make sure to delete --demo) """ import argparse import glob @@ -32,6 +32,7 @@ import librosa import numpy as np +import soundfile as sf from sklearn.model_selection import train_test_split sr = 16000 @@ -77,12 +78,19 @@ def __extract_all_files(filepath: str, data_root: str, data_dir: str): logging.info(f'Skipping extracting. Data already there {data_dir}') -def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1): +def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1, demo=False): X = [] if file_type == "speech": for o in os.listdir(data_dir): if os.path.isdir(os.path.join(data_dir, o)) and o.split("/")[-1] != "_background_noise_": X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav')) + + if demo: + logging.info( + f"For Demonstration, we use {int(len(X)/100)}/{len(X)} speech data. Make sure to remove --demo flag when you actually train your model!" + ) + X = np.random.choice(X, int(len(X) / 100), replace=False) + else: for o in os.listdir(data_dir): if os.path.isdir(os.path.join(data_dir, o)): @@ -305,20 +313,24 @@ def generate_variety_noise(data_dir, filename, prefix): for file in files: y, sr = librosa.load(file, sr=sampling_rate) - for i in range(0, len(y) - sampling_rate, silence_stride): + + for i in range( + 0, len(y) - sampling_rate, silence_stride * 100 + ): # stride * 100 to generate less samples for demo file_name = "{}_{}.wav".format(file.split("/")[-1], i) y_slice = y[i : i + sampling_rate] magnitude = rng.uniform(0.0, 1.0) y_slice *= magnitude out_file_path = os.path.join(silence_path, file_name) - librosa.output.write_wav(out_file_path, y_slice, sr) + sf.write(out_file_path, y_slice, sr) + silence_files.append(out_file_path) new_list_file = os.path.join(silence_path, filename) with open(new_list_file, "w") as outfile: outfile.write("\n".join(silence_files)) - logging.info(f"Generate more background for {file_path}. => {new_list_file} !") + logging.info(f"Generate {len(out_file_path)} background files for {file_path}. 
=> {new_list_file} !") return len(silence_files) @@ -329,9 +341,10 @@ def main(): parser.add_argument("--background_data_root", required=True, default=None, type=str) parser.add_argument('--test_size', required=False, default=0.1, type=float) parser.add_argument('--val_size', required=False, default=0.1, type=float) + parser.add_argument('--seg_len', required=False, default=0.63, type=float) parser.add_argument('--log', required=False, action='store_true') parser.add_argument('--rebalance_method', required=False, default=None, type=str) - parser.add_argument('--generate', required=False, action='store_true') + parser.add_argument('--demo', required=False, action='store_true') parser.set_defaults(log=False, generate=False) args = parser.parse_args() @@ -364,7 +377,7 @@ def main(): logging.info(f"Split speech data!") # dataset provide testing.txt and validation.txt feel free to split data using that with process_google_speech_train - split_train_val_test(speech_data_folder, "speech", args.test_size, args.val_size) + split_train_val_test(speech_data_folder, "speech", args.test_size, args.val_size, args.demo) logging.info(f"Split background data!") split_train_val_test(background_data_folder, "background", args.test_size, args.val_size) @@ -374,13 +387,13 @@ def main(): # Process Speech manifest logging.info(f"=== Write speech data to manifest!") skip_num_val, speech_seg_num_val, speech_val = load_list_write_manifest( - speech_data_folder, out_dir, 'validation_list.txt', 'speech', 0.2, 0.8, 0.63, 0.63 + speech_data_folder, out_dir, 'validation_list.txt', 'speech', 0.2, 0.8, args.seg_len, args.seg_len ) skip_num_test, speech_seg_num_test, speech_test = load_list_write_manifest( - speech_data_folder, out_dir, 'testing_list.txt', 'speech', 0.2, 0.8, 0.01, 0.63 + speech_data_folder, out_dir, 'testing_list.txt', 'speech', 0.2, 0.8, 0.01, args.seg_len ) skip_num_train, speech_seg_num_train, speech_train = load_list_write_manifest( - speech_data_folder, out_dir, 'training_list.txt', 'speech', 0.2, 0.8, 0.63, 0.63 + speech_data_folder, out_dir, 'training_list.txt', 'speech', 0.2, 0.8, args.seg_len, args.seg_len ) logging.info(f'Val: Skip {skip_num_val} samples. Get {speech_seg_num_val} segments! 
=> {speech_val} ') @@ -389,7 +402,7 @@ def main(): # Process background manifest # if we select to generate more background noise data - if args.generate: + if args.demo: logging.info("Start generating more background noise data") generate_variety_noise(background_data_folder, 'validation_list.txt', 'background') generate_variety_noise(background_data_folder, 'training_list.txt', 'background') @@ -400,13 +413,13 @@ def main(): logging.info(f"=== Write background data to manifest!") skip_num_val, background_seg_num_val, background_val = load_list_write_manifest( - background_data_folder, out_dir, 'validation_list.txt', 'background', 0, None, 0.15, 0.63 + background_data_folder, out_dir, 'validation_list.txt', 'background', 0, None, 0.15, args.seg_len ) skip_num_test, background_seg_num_test, background_test = load_list_write_manifest( - background_data_folder, out_dir, 'testing_list.txt', 'background', 0, None, 0.01, 0.63 + background_data_folder, out_dir, 'testing_list.txt', 'background', 0, None, 0.01, args.seg_len ) skip_num_train, background_seg_num_train, background_train = load_list_write_manifest( - background_data_folder, out_dir, 'training_list.txt', 'background', 0, None, 0.15, 0.63 + background_data_folder, out_dir, 'training_list.txt', 'background', 0, None, 0.15, args.seg_len ) logging.info(f'Val: Skip {skip_num_val} samples. Get {background_seg_num_val} segments! => {background_val}') @@ -454,7 +467,7 @@ def main(): rebalance_json(out_dir, speech_train, max_train, 'balanced') if args.rebalance_method == 'fixed': - fixed_test, fixed_val, fixed_train = 1000, 1000, 5000 + fixed_test, fixed_val, fixed_train = 200, 100, 500 logging.info(f"Rebalancing number of samples in classes using {args.rebalance_method} sampling.") logging.info(f'Val: {fixed_val} Test: {fixed_test} Train: {fixed_train}!') diff --git a/tutorials/asr/06_Voice_Activiy_Detection.ipynb b/tutorials/asr/06_Voice_Activiy_Detection.ipynb new file mode 100644 index 000000000000..9b9302fcbe17 --- /dev/null +++ b/tutorials/asr/06_Voice_Activiy_Detection.ipynb @@ -0,0 +1,1153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "R12Yn6W1dt9t" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "## Install dependencies\n", + "!pip install wget\n", + "!apt-get install sox libsndfile1 ffmpeg\n", + "!pip install unidecode\n", + "\n", + "# ## Install NeMo\n", + "!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]\n", + "\n", + "## Install TorchAudio\n", + "!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "## Grab the config we'll use in this example\n", + "!mkdir configs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This VAD tutorial is based on the MatchboxNet model from the paper \"[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)\" with a modified decoder head to suit classification tasks.\n", + "\n", + "The notebook will follow the steps below:\n", + "\n", + " - Dataset preparation: Instruction of downloading datasets. And how to convert it to a format suitable for use with nemo_asr\n", + " - Audio preprocessing (feature extraction): signal normalization, windowing, (log) spectrogram (or mel scale spectrogram, or MFCC)\n", + "\n", + " - Data augmentation using SpecAugment \"[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779)\" to increase number of data samples.\n", + " \n", + " - Develop a small Neural classification model which can be trained efficiently.\n", + " \n", + " - Model training on the Google Speech Commands dataset and Freesound dataset in NeMo.\n", + " \n", + " - Evaluation of error cases of the model by audibly hearing the samples\n", + " \n", + " - Add more evaluation metrics and transfer learning/fine tune\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "I62_LJzc-p2b" + }, + "outputs": [], + "source": [ + "# Some utility imports\n", + "import os\n", + "from omegaconf import OmegaConf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab": {}, + "colab_type": "text", + "id": "K_M8wpkwd7d7" + }, + "source": [ + "# Data Preparation\n", + "\n", + "## Download the background data\n", + "We suggest to use the background categories of [freesound](https://freesound.org/) dataset as our non-speech/background data. \n", + "We provide scripts for downloading and resampling it. Please have a look at [NeMo docs VAD Data Preparation]( https://docs.nvidia.com/deeplearning/nemo/developer_guide/en/v0.11.0/voice_activity_detection/tutorial.html#data-preparation). Note that downloading this dataset may takes hours. \n", + "\n", + "**NOTE:** Here, this tutorial serves as a demonstration on how to train and evaluate models for vad using NeMo. We avoid using freesound dataset, and use `_background_noise_` category in Google Speech Commands Dataset as non-speech/background data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the speech data\n", + " \n", + "We will use the open source Google Speech Commands Dataset (we will use V2 of the dataset for the tutorial, but require very minor changes to support V1 dataset) as our speech data. Google Speech Commands Dataset V2 will take roughly 6GB disk space. 
These scripts below will download the dataset and convert it to a format suitable for use with nemo_asr.\n", + "\n", + "\n", + "**NOTE**: You may additionally pass `--test_size` or `--val_size` flag for spliting train val and test data.\n", + "You may additionally pass `--seg_len` flag for indicating the segment length. Dafault is 0.63s.\n", + "\n", + "**NOTE**: You may additionally pass a `--rebalance_method='fixed|over|under'` at the end of the script to rebalance the class samples in the manifest. \n", + "* 'fixed': Fixed number of sample for each class. For example, train 500, val 100, and test 200. (Change number in script if you want)\n", + "* 'over': Oversampling rebalance method\n", + "* 'under': Undersampling rebalance method\n", + "\n", + "**NOTE**: We only take a samll subset of speech data for demonstration, if you want to use entire speech data. Don't forget to **delete `--demo`** and change rebalance method/number. `_background_noise_` category only has **6** audio files. So we would like to generate more based on the audio files to enlarge our background training data. If you want to use your own background noise data, just change the `background_data_root` and **delete `--demo`**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp = 'src'\n", + "data_folder = 'data'\n", + "if not os.path.exists(tmp):\n", + " os.makedirs(tmp)\n", + "if not os.path.exists(data_folder):\n", + " os.makedirs(data_folder)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "script = os.path.join(tmp, 'process_vad_data.py')\n", + "if not os.path.exists(script):\n", + " !wget -P $tmp https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/process_vad_data.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "speech_data_root = os.path.join(data_folder, 'google_dataset_v2')\n", + "background_data_root = os.path.join(data_folder, 'google_dataset_v2/google_speech_recognition_v2/_background_noise_')# your \n", + "out_dir = os.path.join(data_folder, 'manifest')\n", + "if not os.path.exists(speech_data_root):\n", + " os.mkdir(speech_data_root)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This may take a few minutes\n", + "!python $script \\\n", + " --out_dir={out_dir} \\\n", + " --speech_data_root={speech_data_root} \\\n", + " --background_data_root={background_data_root}\\\n", + " --log \\\n", + " --demo \\\n", + " --rebalance_method='fixed' " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TTsxp0nZ1zqo" + }, + "source": [ + "## Preparing the manifest file\n", + "\n", + "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", + "\n", + "1) `audio_filepath`: Refers to the path to the raw audio file
\n", + "2) `label`: The class label (speech or background) of this sample
\n", + "3) `duration`: The length of the audio file, in seconds.
\n", + "4) `offset`: The start of the segment, in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ytTFGVe0g9wk" + }, + "outputs": [], + "source": [ + "# change below if you don't have or don't want to use rebalanced data\n", + "train_dataset = 'data/manifest/balanced_background_training_manifest.json,data/manifest/balanced_speech_training_manifest.json' \n", + "val_dataset = 'data/manifest/background_validation_manifest.json,data/manifest/speech_validation_manifest.json' \n", + "test_dataset = 'data/manifest/balanced_background_testing_manifest.json,data/manifest/balanced_speech_testing_manifest.json' " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s0SZy9SEhOBf" + }, + "source": [ + "## Read a few rows of the manifest file \n", + "\n", + "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", + "\n", + "1) `audio_filepath`: Refers to the path to the raw audio file
\n", + "2) `command`: The class label (or speech command) of this sample
\n", + "3) `duration`: The length of the audio file, in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_test_dataset = test_dataset.split(',')[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HYBidCMIhKQV", + "scrolled": true + }, + "outputs": [], + "source": [ + "!head -n 5 {sample_test_dataset}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training - Preparation\n", + "\n", + "We will be training a MatchboxNet model from paper \"[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)\" evolved from [QuartzNet](https://arxiv.org/pdf/1910.10261.pdf) model. The benefit of QuartzNet over JASPER models is that they use Separable Convolutions, which greatly reduce the number of parameters required to get good model accuracy.\n", + "\n", + "MatchboxNet models generally follow the model definition pattern QuartzNet-[BxRXC], where B is the number of blocks, R is the number of convolutional sub-blocks, and C is the number of channels in these blocks. Each sub-block contains a 1-D masked convolution, batch normalization, ReLU, and dropout.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ieAPOM9thTN2" + }, + "outputs": [], + "source": [ + "# NeMo's \"core\" package\n", + "import nemo\n", + "# NeMo's ASR collection - this collections contains complete ASR models and\n", + "# building blocks (modules) for ASR\n", + "import nemo.collections.asr as nemo_asr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ss9gLcDv30jI" + }, + "source": [ + "## Model Configuration\n", + "The MatchboxNet Model is defined in a config file which declares multiple important sections.\n", + "\n", + "They are:\n", + "\n", + "1) `model`: All arguments that will relate to the Model - preprocessors, encoder, decoder, optimizer and schedulers, datasets and any other related information\n", + "\n", + "2) `trainer`: Any argument to be passed to PyTorch Lightning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_CONFIG = \"matchboxnet_3x1x64_vad.yaml\"\n", + "\n", + "if not os.path.exists(f\"configs/{MODEL_CONFIG}\"):\n", + " !wget -P configs/ \"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/{MODEL_CONFIG}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yoVAs9h1lfci", + "scrolled": true + }, + "outputs": [], + "source": [ + "# This line will print the entire config of the MatchboxNet model\n", + "config_path = f\"configs/{MODEL_CONFIG}\"\n", + "config = OmegaConf.load(config_path)\n", + "print(config.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "m2lJPR0a3qww" + }, + "outputs": [], + "source": [ + "# Preserve some useful parameters\n", + "labels = config.model.labels\n", + "sample_rate = config.sample_rate" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8_pmjeed78rJ" + }, + "source": [ + "### Setting up the datasets within the config\n", + "\n", + "If you'll notice, there are a few config dictionaries called `train_ds`, 
`validation_ds` and `test_ds`. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DIe6Qfs18MiQ" + }, + "outputs": [], + "source": [ + "print(config.model.train_ds.pretty())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Fb01hl868Uc3" + }, + "source": [ + "### `???` inside configs\n", + "\n", + "You will often notice that some configs have `???` in place of paths. This is used as a placeholder so that the user can change the value at a later time.\n", + "\n", + "Let's add the paths to the manifests to the config above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "m181HXev8T97" + }, + "outputs": [], + "source": [ + "config.model.train_ds.manifest_filepath = train_dataset\n", + "config.model.validation_ds.manifest_filepath = val_dataset\n", + "config.model.test_ds.manifest_filepath = test_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pbXngoCM5IRG" + }, + "source": [ + "## Building the PyTorch Lightning Trainer\n", + "\n", + "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem!\n", + "\n", + "Lets first instantiate a Trainer object!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bYtvdBlG5afU" + }, + "outputs": [], + "source": [ + "import torch\n", + "import pytorch_lightning as pl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jRN18CdH51nN" + }, + "outputs": [], + "source": [ + "print(\"Trainer config - \\n\")\n", + "print(config.trainer.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gHf6cHvm6H9b" + }, + "outputs": [], + "source": [ + "# Lets modify some trainer configs for this demo\n", + "# Checks if we have GPU available and uses it\n", + "cuda = 1 if torch.cuda.is_available() else 0\n", + "config.trainer.gpus = cuda\n", + "\n", + "# Reduces maximum number of epochs to 5 for quick demonstration\n", + "config.trainer.max_epochs = 5\n", + "\n", + "# Remove distributed training flags\n", + "config.trainer.distributed_backend = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "UB9nr7G56G3L" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(**config.trainer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2wt603Vq6sqX" + }, + "source": [ + "## Setting up a NeMo Experiment\n", + "\n", + "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it ! 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TfWJFg7p6Ezf" + }, + "outputs": [], + "source": [ + "from nemo.utils.exp_manager import exp_manager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SC-QPoW44-p2" + }, + "outputs": [], + "source": [ + "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Yqi6rkNR7Dph" + }, + "outputs": [], + "source": [ + "# The exp_dir provides a path to the current experiment for easy access\n", + "exp_dir = str(exp_dir)\n", + "exp_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "t0zz-vHH7Uuh" + }, + "source": [ + "## Building the MatchboxNet Model\n", + "\n", + "MatchboxNet is an ASR model with a classification task - it generates one label for the entire provided audio stream. Therefore we encapsulate it inside the `EncDecClassificationModel` as follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FRMrKhyf5vhy" + }, + "outputs": [], + "source": [ + "asr_model = nemo_asr.models.EncDecClassificationModel(cfg=config.model, trainer=trainer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jA9UND-Q_oyw" + }, + "source": [ + "# Training a MatchboxNet Model\n", + "\n", + "As MatchboxNet is inherently a PyTorch Lightning Model, it can easily be trained in a single line - `trainer.fit(model)` !\n", + "\n", + "\n", + "# Training the model\n", + "\n", + "Even with such a small model (73k parameters), and just 5 epochs (should take just a few minutes to train), you should be able to get a test set accuracy score around 98.83% (this result is for the [freesound](https://freesound.org/) dataset) with enough training data. \n", + "\n", + "**NOTE:** If you follow our tutorial and user the generated background data, you may notice the below results are acceptable, but please remember, this tutorial is only for **demostration** and the dataset is not good enough. Please change background dataset and train with enough data for improvement!\n", + "\n", + "Experiment with increasing the number of epochs or with batch size to see how much you can improve the score! \n", + "\n", + "**NOTE:** Noise rebustness is quite important for VAD task. Below we list the augmentation we used in this demo. 
\n", + "Please refer to [05_Online_Noise_Augmentation.ipynb](https://github.com/NVIDIA/NeMo/blob/candidate/tutorials/asr/05_Online_Noise_Augmentation.ipynb) for understanding noise augmentation in NeMo.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Noise augmentation\n", + "print(config.model.train_ds.augmentor.pretty()) # noise augmentation\n", + "print(config.model.spec_augment.pretty()) # SpecAug data augmentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are interested in **pretrained** model, please have a look at [Transfer Leaning & Fine-tuning on a new dataset](#Transfer-Leaning-&-Fine-tuning-on-a-new-dataset) and [Offline_and_Online_VAD_Demo](https://github.com/NVIDIA/NeMo/blob/candidate/tutorials/asr/07_Online_Voice_Activity_Detection_Demo.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3ngKcRFqBfIF" + }, + "source": [ + "### Monitoring training progress\n", + "\n", + "Before we begin training, lets first create a Tensorboard visualization to monitor progress\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cyfec0PDBsXa" + }, + "outputs": [], + "source": [ + "# Load the TensorBoard notebook extension\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4L5ymu-QBxmz" + }, + "outputs": [], + "source": [ + "%tensorboard --logdir {exp_dir}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZApuELDIKQgC" + }, + "source": [ + "### Training for 5 epochs\n", + "We see below that the model begins to get modest scores on the validation set after just 5 epochs of training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9xiUUJlH5KdD" + }, + "outputs": [], + "source": [ + "trainer.fit(asr_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fast Training\n", + "\n", + "We can dramatically improve the time taken to train this model by using Multi GPU training along with Mixed Precision.\n", + "\n", + "For multi-GPU training, take a look at [the PyTorch Lightning Multi-GPU training section](https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html)\n", + "\n", + "For mixed-precision training, take a look at [the PyTorch Lightning Mixed-Precision training section](https://pytorch-lightning.readthedocs.io/en/latest/apex.html)\n", + "\n", + "```python\n", + "# Mixed precision:\n", + "trainer = Trainer(amp_level='O1', precision=16)\n", + "\n", + "# Trainer with a distributed backend:\n", + "trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", + "\n", + "# Of course, you can combine these flags as well.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Dkds1jSvKgSc" + }, + "source": [ + "# Evaluation\n", + "\n", + "## Evaluation on the Test set\n", + "\n", + "Lets compute the final score on the test set via `trainer.test(model)`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mULTrhEJ_6wV", + "scrolled": true + }, + "outputs": [], + "source": [ + "trainer.test(asr_model, ckpt_path=None)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "colab_type": "text", + "id": "ifDHkunjM8y6" + }, + "source": [ + "## Evaluation of incorrectly predicted samples\n", + "\n", + "Given that we have a trained model, which performs reasonably well, let's try to listen to the samples where the model is least confident in its predictions." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PcJrZ72sNCkM" + }, + "source": [ + "### Extract the predictions from the model\n", + "\n", + "We want to possess the actual logits of the model instead of just the final evaluation score, so we can define a function to perform the forward step for us without computing the final loss. Instead, we extract the logits per batch of samples provided." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rvxdviYtOFjK" + }, + "source": [ + "### Accessing the data loaders\n", + "\n", + "We can utilize the `setup_test_data` method in order to instantiate a data loader for the dataset we want to analyze.\n", + "\n", + "For convenience, we can access these instantiated data loaders using the following accessors - `asr_model._train_dl`, `asr_model._validation_dl` and `asr_model._test_dl`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CB0QZCAmM656" + }, + "outputs": [], + "source": [ + "asr_model.setup_test_data(config.model.test_ds)\n", + "test_dl = asr_model._test_dl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rA7gXawcPoip" + }, + "source": [ + "### Partial Test Step\n", + "\n", + "Below we define a utility function to perform most of the test step. For reference, the test step is defined as follows:\n", + "\n", + "```python\n", + " def test_step(self, batch, batch_idx, dataloader_idx=0):\n", + " audio_signal, audio_signal_len, labels, labels_len = batch\n", + " logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", + " loss_value = self.loss(logits=logits, labels=labels)\n", + " correct_counts, total_counts = self._accuracy(logits=logits, labels=labels)\n", + " return {'test_loss': loss_value, 'test_correct_counts': correct_counts, 'test_total_counts': total_counts}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sBsDOm5ROpQI" + }, + "outputs": [], + "source": [ + "@torch.no_grad()\n", + "def extract_logits(model, dataloader):\n", + " logits_buffer = []\n", + " label_buffer = []\n", + "\n", + " # Follow the above definition of the test_step\n", + " for batch in dataloader:\n", + " audio_signal, audio_signal_len, labels, labels_len = batch\n", + " logits = model(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", + "\n", + " logits_buffer.append(logits)\n", + " label_buffer.append(labels)\n", + " print(\".\", end='')\n", + " print()\n", + "\n", + " print(\"Finished extracting logits !\")\n", + " logits = torch.cat(logits_buffer, 0)\n", + " labels = torch.cat(label_buffer, 0)\n", + " return logits, labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mZSdprUlOuoV" + }, + "outputs": [], + "source": [ + "cpu_model = asr_model.cpu()\n", + "cpu_model.eval()\n", + "logits, labels = extract_logits(cpu_model, test_dl)\n", + "print(\"Logits:\", logits.shape, \"Labels :\", labels.shape)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9Wd0ukgNXRBz", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Compute accuracy - `_accuracy` is a PyTorch Lightning Metric !\n", + "correct_count, total_count = cpu_model._accuracy(logits=logits, labels=labels)\n", + "print(\"Accuracy : \", float(correct_count * 100.) / float(total_count))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NwN9OSqCauSH" + }, + "source": [ + "### Filtering out incorrect samples\n", + "Let us now filter out the incorrectly labeled samples from the total set of samples in the test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "N1YJvsmcZ0uE" + }, + "outputs": [], + "source": [ + "import librosa\n", + "import json\n", + "import IPython.display as ipd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jZAT9yGAayvR" + }, + "outputs": [], + "source": [ + "# First lets create a utility class to remap the integer class labels to actual string label\n", + "class ReverseMapLabel:\n", + " def __init__(self, data_loader):\n", + " self.label2id = dict(data_loader.dataset.label2id)\n", + " self.id2label = dict(data_loader.dataset.id2label)\n", + "\n", + " def __call__(self, pred_idx, label_idx):\n", + " return self.id2label[pred_idx], self.id2label[label_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "X3GSXvYHa4KJ" + }, + "outputs": [], + "source": [ + "# Next, lets get the indices of all the incorrectly labeled samples\n", + "sample_idx = 0\n", + "incorrect_preds = []\n", + "rev_map = ReverseMapLabel(test_dl)\n", + "\n", + "# Remember, evaluated_tensor = (loss, logits, labels)\n", + "probs = torch.softmax(logits, dim=-1)\n", + "probas, preds = torch.max(probs, dim=-1)\n", + "\n", + "incorrect_ids = (preds != labels).nonzero()\n", + "for idx in incorrect_ids:\n", + " proba = float(probas[idx][0])\n", + " pred = int(preds[idx][0])\n", + " label = int(labels[idx][0])\n", + " idx = int(idx[0]) + sample_idx\n", + "\n", + " incorrect_preds.append((idx, *rev_map(pred, label), proba))\n", + "\n", + "print(f\"Num test samples : {total_count.item()}\")\n", + "print(f\"Num errors : {len(incorrect_preds)}\")\n", + "\n", + "# First lets sort by confidence of prediction\n", + "incorrect_preds = sorted(incorrect_preds, key=lambda x: x[-1], reverse=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0JgGo71gcDtD" + }, + "source": [ + "### Examine a subset of incorrect samples\n", + "Let's print out the (test id, predicted label, ground truth label, confidence) tuple of first 20 incorrectly labeled samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "x37wNJsNbcw0" + }, + "outputs": [], + "source": [ + "for incorrect_sample in incorrect_preds[:20]:\n", + " print(str(incorrect_sample))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tDnwYsDKcLv9" + }, + "source": [ + "### Define a threshold below which we designate a model's prediction as \"low confidence\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dpvzeh4PcGJs" + }, + "outputs": [], + "source": [ + "# 
Filter out how many such samples exist\n", + "low_confidence_threshold = 0.8\n", + "count_low_confidence = len(list(filter(lambda x: x[-1] <= low_confidence_threshold, incorrect_preds)))\n", + "print(f\"Number of low confidence predictions : {count_low_confidence}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ERXyXvCAcSKR" + }, + "source": [ + "### Lets hear the samples which the model has least confidence in !" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kxjNVjX8cPNP" + }, + "outputs": [], + "source": [ + "# First lets create a helper function to parse the manifest files\n", + "def parse_manifest(manifest):\n", + " data = []\n", + " for line in manifest:\n", + " line = json.loads(line)\n", + " data.append(line)\n", + "\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IWxqw5k-cUVd" + }, + "outputs": [], + "source": [ + "# Next, lets create a helper function to actually listen to certain samples\n", + "def listen_to_file(sample_id, pred=None, label=None, proba=None):\n", + " # Load the audio waveform using librosa\n", + " filepath = test_samples[sample_id]['audio_filepath']\n", + " audio, sample_rate = librosa.load(filepath,\n", + " offset = test_samples[sample_id]['offset'],\n", + " duration = test_samples[sample_id]['duration'])\n", + "\n", + "\n", + " if pred is not None and label is not None and proba is not None:\n", + " print(f\"filepath: {filepath}, Sample : {sample_id} Prediction : {pred} Label : {label} Confidence = {proba: 0.4f}\")\n", + " else:\n", + " \n", + " print(f\"Sample : {sample_id}\")\n", + "\n", + " return ipd.Audio(audio, rate=sample_rate)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HPj1tFNIcXaU" + }, + "outputs": [], + "source": [ + "import json\n", + "# Now lets load the test manifest into memory\n", + "all_test_samples = []\n", + "for _ in test_dataset.split(','):\n", + " print(_)\n", + " with open(_, 'r') as test_f:\n", + " test_samples = test_f.readlines()\n", + " \n", + " all_test_samples.extend(test_samples)\n", + "print(len(all_test_samples))\n", + "test_samples = parse_manifest(all_test_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Nt7b_uiScZcC" + }, + "outputs": [], + "source": [ + "# Finally, lets listen to all the audio samples where the model made a mistake\n", + "# Note: This list of incorrect samples may be quite large, so you may choose to subsample `incorrect_preds`\n", + "count = min(count_low_confidence, 20) # replace this line with just `count_low_confidence` to listen to all samples with low confidence\n", + "\n", + "for sample_id, pred, label, proba in incorrect_preds[:count]:\n", + " ipd.display(listen_to_file(sample_id, pred=pred, label=label, proba=proba))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding evaluation metrics\n", + "\n", + "Here is an example of how to use more metrics (e.g. 
from pytorch_lightning) to evaluate your result.\n", + "\n", + "**Note:** If you would like to add metrics for training and testing, have a look at \n", + "```python\n", + "NeMo/nemo/collections/common/metrics\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pytorch_lightning.metrics.functional import confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_, pred = logits.topk(1, dim=1, largest=True, sorted=True)\n", + "confusion_matrix(pred=pred, target=labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Transfer Leaning & Fine-tuning on a new dataset\n", + "For transfer learning, please refer to [**Transfer learning** part of ASR tutorial](https://github.com/NVIDIA/NeMo/blob/candidate/tutorials/asr/01_ASR_with_NeMo.ipynb)\n", + "\n", + "More details on saving and restoring checkpoint, and exporting a model in its entirety, please refer to [**Fine-tuning on a new dataset** & **Advanced Usage parts** of Speech Command tutorial](https://github.com/NVIDIA/NeMo/blob/candidate/tutorials/asr/02_Speech_Commands.ipynb)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LyIegk2CPNsI" + }, + "source": [ + "# Inference and more\n", + "If you are interested in **pretrained** model and **streaming inference**, please have a look at our incoming tutorial 07 Offline_and_Online_VAD_Demo\n", + "\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "06_Voice_Activity_Detection.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From c2ebfd06200353c22d0526e900440574d291c534 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Sun, 30 Aug 2020 21:47:47 -0700 Subject: [PATCH 05/12] Return empty dictionaries by default to make PTL happy with null val/test dataloaders (#1089) Signed-off-by: smajumdar --- nemo/core/classes/modelPT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index eb64e5348c93..238921637339 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -495,14 +495,14 @@ def val_dataloader(self): return self._validation_dl def validation_step(self, batch, batch_ix): - pass + return {} def test_dataloader(self): if self._test_dl is not None: return self._test_dl def test_step(self, batch, batch_ix): - pass + return {} def validation_epoch_end( self, outputs: Union[List[Dict[str, torch.Tensor]], List[List[Dict[str, torch.Tensor]]]] @@ -529,7 +529,7 @@ def validation_epoch_end( """ # Case where we dont provide data loaders if outputs is not None and len(outputs) == 0: - return + return {} # Case where we provide exactly 1 data loader if type(outputs[0]) == dict: @@ -624,7 +624,7 @@ def test_epoch_end( """ # Case where we dont provide data loaders if outputs is not None and len(outputs) == 0: - return + return {} # Case where we provide exactly 1 data loader if 
type(outputs[0]) == dict: From a5bc065c2a85d3f2067bdcce2c0cce10ab15690e Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Mon, 31 Aug 2020 09:43:23 -0700 Subject: [PATCH 06/12] NER tutorial - notebook (#1071) * ner inference added Signed-off-by: ekmb * clean up methods Signed-off-by: ekmb * ner notebook added Signed-off-by: ekmb * header fix Signed-off-by: ekmb * remove unused import Signed-off-by: ekmb * pretrained model link added Signed-off-by: ekmb * loss setup and data_desc separated Signed-off-by: ekmb * updates for finetuning from pretrained Signed-off-by: ekmb * move loss setup Signed-off-by: ekmb * notebook update Signed-off-by: ekmb * typos fixed, docstring added Signed-off-by: ekmb --- .../conf/token_classification_config.yaml | 10 +- .../data/import_from_iob_format.py | 124 +++ .../punctuation_capitalization.py | 4 +- .../token_classification.py | 77 +- .../punctuation_capitalization_dataset.py | 17 +- .../token_classification_dataset.py | 41 +- .../token_classification_descriptor.py | 1 + .../punctuation_capitalization_model.py | 24 +- .../token_classification_model.py | 308 ++++++- nemo/collections/nlp/parts/utils_funcs.py | 104 ++- .../nlp/Punctuation_and_Capitalization.ipynb | 44 +- ...on_Named_Entity_Recognition_tutorial.ipynb | 806 ++++++++++++++++++ 12 files changed, 1438 insertions(+), 122 deletions(-) create mode 100644 examples/nlp/token_classification/data/import_from_iob_format.py create mode 100644 tutorials/nlp/Token_Classification_Named_Entity_Recognition_tutorial.ipynb diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml index 866835228ac3..69bf876b557f 100644 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ b/examples/nlp/token_classification/conf/token_classification_config.yaml @@ -47,6 +47,10 @@ model: ignore_extra_tokens: false ignore_start_end: false use_cache: true + # shared among dataloaders + num_workers: 2 + pin_memory: false + drop_last: false train_ds: text_file: text_train.txt @@ -54,9 +58,6 @@ model: shuffle: true num_samples: -1 batch_size: 64 - num_workers: 2 - pin_memory: false - drop_last: false validation_ds: text_file: text_dev.txt @@ -64,9 +65,6 @@ model: shuffle: false num_samples: -1 batch_size: 64 - num_workers: 2 - pin_memory: false - drop_last: false language_model: pretrained_model_name: bert-base-uncased diff --git a/examples/nlp/token_classification/data/import_from_iob_format.py b/examples/nlp/token_classification/data/import_from_iob_format.py new file mode 100644 index 000000000000..6787fdc6a10b --- /dev/null +++ b/examples/nlp/token_classification/data/import_from_iob_format.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
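Before moving on to the conversion script that follows, a brief aside on the config change above, which moves `num_workers`, `pin_memory`, and `drop_last` out of the per-split sections and into the shared `model.dataset` section. The sketch below is a minimal, self-contained illustration (not part of this patch) of how such shared settings can be combined with the per-split `batch_size`/`shuffle` values when a DataLoader is built; the config keys mirror the YAML above, and the toy dataset is made up.

```python
# Minimal sketch (not part of this patch): consuming the shared model.dataset
# dataloader settings together with the per-split train_ds options.
import torch
from omegaconf import OmegaConf
from torch.utils.data import DataLoader, TensorDataset

cfg = OmegaConf.create(
    {
        "model": {
            "dataset": {"num_workers": 2, "pin_memory": False, "drop_last": False},
            "train_ds": {"batch_size": 64, "shuffle": True},
        }
    }
)

toy_dataset = TensorDataset(torch.arange(256))  # stand-in for the real token classification dataset
train_loader = DataLoader(
    toy_dataset,
    batch_size=cfg.model.train_ds.batch_size,   # per-split setting
    shuffle=cfg.model.train_ds.shuffle,
    num_workers=cfg.model.dataset.num_workers,  # shared across train/validation/test loaders
    pin_memory=cfg.model.dataset.pin_memory,
    drop_last=cfg.model.dataset.drop_last,
)
print(len(train_loader))
```

Keeping these keys under `model.dataset` means they are set once instead of being repeated in `train_ds` and `validation_ds`.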
+ +import argparse +import os + +from nemo.utils import logging + + +def __convert_data(in_file: str, out_text_f: str, out_labels_f: str, max_length: int): + """ + Convert data from the IOB format to NeMo accepted format described below. + in_file should be in the IOB format, see example here: + https://www.clips.uantwerpen.be/conll2003/ner/. + + Args: + in_file: input file name + out_text_f: output file with text + out_labels_f: output file with labels + max_length: use -1 to leave the examples' length as is, otherwise long examples will be splitted into multiple + examples + After the conversion, the dataset is splitted into 2 files: text.txt + and labels.txt. + Each line of the text.txt file contains text sequences, where words + are separated with spaces. The labels.txt file contains corresponding + labels for each word in text.txt, the labels are separated with spaces. + Each line of the files should follow the format: + [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and + [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). + + """ + in_file = open(in_file, 'r') + + if max_length == -1: + with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: + for line in in_file: + if line == '\n': + out_text.write(line) + out_labels.write(line) + else: + line = line.split() + out_text.write(line[0] + ' ') + out_labels.write(line[-1] + ' ') + + else: + words = [] + labels = [] + with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: + lines = in_file.readlines() + for line_id, line in enumerate(lines): + logging.info(f"{line_id} {len(lines)}") + contends = line.strip() + if len(contends) == 0: + assert len(words) == len(labels) + if len(words) > max_length: + # split if the sentence is longer than max_length + while len(words) > max_length: + tmplabel = labels[:max_length] + for iidx in range(len(tmplabel)): + if tmplabel.pop() == 'O': + break + l = ' '.join([label for label in labels[: len(tmplabel) + 1] if len(label) > 0]) + w = ' '.join([word for word in words[: len(tmplabel) + 1] if len(word) > 0]) + + out_text.write(w + "\n") + out_labels.write(l + "\n") + words = words[len(tmplabel) + 1 :] + labels = labels[len(tmplabel) + 1 :] + + if len(words) == 0: + continue + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + + out_text.write(w + "\n") + out_labels.write(l + "\n") + words = [] + labels = [] + continue + + word = line.strip().split()[0] + label = line.strip().split()[-1] + words.append(word) + labels.append(label) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Convert data from IOB format to the format compatible with nlp/examples/token_classification.py' + ) + parser.add_argument("--data_file", required=True, type=str, help='path to a file in IOB format') + parser.add_argument( + "--max_length", + default=-1, + type=int, + help='use -1 to leave the examples\'s length as is, ' + 'otherwise long examples will be splitted into ' + 'multiple examples', + ) + args = parser.parse_args() + + data_dir = os.path.dirname(args.data_file) + basename = os.path.basename(args.data_file) + prefix, ext = os.path.splitext(basename) + if not os.path.exists(args.data_file): + raise FileNotFoundError("{data_file} not found in {data_dir}") + + logging.info(f'Processing {args.data_file}') + out_text = os.path.join(data_dir, 'text_' + prefix + '.txt') + out_labels = os.path.join(data_dir, 'labels_' + prefix + '.txt') + + __convert_data(args.data_file, 
out_text, out_labels, args.max_length) + logging.info(f'Processing of the {args.data_file} is complete') diff --git a/examples/nlp/token_classification/punctuation_capitalization.py b/examples/nlp/token_classification/punctuation_capitalization.py index bcdd188086c0..482a86eb1cd5 100644 --- a/examples/nlp/token_classification/punctuation_capitalization.py +++ b/examples/nlp/token_classification/punctuation_capitalization.py @@ -41,8 +41,8 @@ def main(cfg: DictConfig) -> None: inference_results = model.add_punctuation_capitalization(queries) for query, result in zip(queries, inference_results): - logging.info(f'Query : {query}') - logging.info(f'Combined: {result.strip()}\n') + logging.info(f'Query : {query}') + logging.info(f'Result: {result.strip()}\n') if __name__ == '__main__': diff --git a/examples/nlp/token_classification/token_classification.py b/examples/nlp/token_classification/token_classification.py index 3369dede8c02..eba8a470dd2e 100644 --- a/examples/nlp/token_classification/token_classification.py +++ b/examples/nlp/token_classification/token_classification.py @@ -12,8 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +## Tasks +The token classification script supports the Named Entity Recognition task and other token-level classification tasks, +as long as the data follows the format specified below. + +The Token Classification Model requires the data to be split into 2 files: text.txt and labels.txt. +Each line of the text.txt file contains text sequences, where words are separated with spaces, i.e.: +[WORD] [SPACE] [WORD] [SPACE] [WORD]. +The labels.txt file contains the corresponding labels for each word in text.txt, the labels are separated with spaces, i.e.: +[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]. + +Example of a text.txt file: +Jennifer is from New York City . +She likes ... +... + +Corresponding labels.txt file: +B-PER O O B-LOC I-LOC I-LOC O +O O ... +... + + +## Preparing the dataset +To convert data in IOB format to the format required for training, run +examples/nlp/token_classification/data/import_from_iob_format.py on your train and dev files, as follows: + +python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE + + +## Model Training + +To train the TokenClassification model with the default config file, run: + + python token_classification.py \ + model.dataset.data_dir=<PATH_TO_DATA_DIR> \ + trainer.max_epochs=<NUM_EPOCHS> \ + trainer.gpus=[<NUM_GPUS>] + +More details on how to use this script can be found in +tutorials/nlp/Token_Classification_Named_Entity_Recognition_tutorial.ipynb +""" +import os + import pytorch_lightning as pl -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import TokenClassificationModel from nemo.core.config import hydra_runner @@ -23,14 +66,42 @@ @hydra_runner(config_path="conf", config_name="token_classification_config") def main(cfg: DictConfig) -> None: - logging.info(f'Config: {cfg.pretty()}') + logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) + exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) model = TokenClassificationModel(cfg.model, trainer=trainer) trainer.fit(model) if cfg.model.nemo_path: model.save_to(cfg.model.nemo_path) + """ + After model training is done, you can use the model for inference. 
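As an aside on the data layout documented at the top of this script: the following toy sketch (not the `import_from_iob_format.py` script itself; the sentence and tags are taken from the example above) shows how IOB-formatted lines map onto one text row and one labels row per sentence.

```python
# Toy sketch: IOB lines -> one "text" row and one "labels" row per sentence,
# matching the [WORD] [SPACE] [WORD] / [LABEL] [SPACE] [LABEL] layout above.
iob_lines = [
    "Jennifer B-PER", "is O", "from O", "New B-LOC", "York I-LOC", "City I-LOC", ". O",
    "",  # a blank line separates sentences in the IOB file
]

text_rows, label_rows = [], []
words, labels = [], []
for line in iob_lines:
    if not line.strip():
        text_rows.append(" ".join(words))
        label_rows.append(" ".join(labels))
        words, labels = [], []
        continue
    word, label = line.split()
    words.append(word)
    labels.append(label)

print(text_rows[0])   # Jennifer is from New York City .
print(label_rows[0])  # B-PER O O B-LOC I-LOC I-LOC O
```

The real conversion script additionally supports splitting overly long sentences via `--max_length`.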
+ You can either evaluate data from a text_file that follows training data format, + or provide a list of queries you want to add entities to + """ + # run evaluation on a dataset from file + model.evaluate_from_file( + text_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.text_file), + labels_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.labels_file), + output_dir=exp_dir, + add_confusion_matrix=True, + normalize_confusion_matrix=True, + ) + + # run an inference on a few examples + queries = [ + 'we bought four shirts from the nvidia gear store in santa clara.', + 'Nvidia is a company.', + 'The Adventures of Tom Sawyer by Mark Twain is an 1876 novel about a young boy growing ' + + 'up along the Mississippi River.', + ] + results = model.add_predictions(queries) + + for query, result in zip(queries, results): + logging.info('') + logging.info(f'Query : {query}') + logging.info(f'Result: {result.strip()}\n') + if __name__ == '__main__': main() diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py index dcde0cbee048..1bc8c7dca12e 100644 --- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py +++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py @@ -440,22 +440,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), } - def __init__( - self, - queries: List[str], - max_seq_length: int, - tokenizer: TokenizerSpec, - ignore_extra_tokens=False, - ignore_start_end: Optional[bool] = False, - ): + def __init__(self, queries: List[str], max_seq_length: int, tokenizer: TokenizerSpec): """ Initializes BertPunctuationCapitalizationInferDataset. """ - features = get_features( - queries=queries, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) + features = get_features(queries=queries, max_seq_length=max_seq_length, tokenizer=tokenizer) self.all_input_ids = features[0] self.all_segment_ids = features[1] diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py index 3fefe57d7fcb..cfbec8f77532 100644 --- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py +++ b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py @@ -38,8 +38,8 @@ def get_features( queries: List[str], - max_seq_length: int, tokenizer: TokenizerSpec, + max_seq_length: int = -1, label_ids: dict = None, pad_label: str = 'O', raw_labels: List[str] = None, @@ -50,8 +50,8 @@ def get_features( Processes the data and returns features. Args: queries: text sequences - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] tokenizer: such as NemoBertTokenizer + max_seq_length: max sequence length minus 2 for [CLS] and [SEP], when -1 - use the max len from the data pad_label: pad value use for labels. By default, it's the neutral label. raw_labels: list of labels for every word in a sequence label_ids: dict to map labels to label ids. 
@@ -111,8 +111,9 @@ def get_features( labels.append(pad_id) all_labels.append(labels) - max_seq_length = min(max_seq_length, max(sent_lengths)) - logging.info(f'Max length: {max_seq_length}') + max_seq_length_data = max(sent_lengths) + max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data + logging.info(f'Setting Max Seq length to: {max_seq_length}') get_stats(sent_lengths) too_long_count = 0 @@ -143,7 +144,7 @@ def get_features( logging.warning(f'{too_long_count} are longer than {max_seq_length}') - for i in range(min(len(all_input_ids), 5)): + for i in range(min(len(all_input_ids), 1)): logging.info("*** Example ***") logging.info("i: %s", i) logging.info("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) @@ -152,7 +153,7 @@ def get_features( logging.info("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: logging.info("labels: %s", " ".join(list(map(str, all_labels[i])))) - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_labels) + return (all_input_ids, all_segment_ids, all_input_mask, all_subtokens_mask, all_loss_mask, all_labels) class BertTokenClassificationDataset(Dataset): @@ -188,8 +189,8 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), + 'loss_mask': NeuralType(('B', 'T'), MaskType()), 'labels': NeuralType(('B', 'T'), LabelsType()), } @@ -217,7 +218,10 @@ def __init__( tokenizer_type = type(tokenizer.tokenizer).__name__ vocab_size = getattr(tokenizer, "vocab_size", 0) features_pkl = os.path.join( - data_dir, "cached_{}_{}_{}_{}".format(filename, tokenizer_type, str(max_seq_length), str(vocab_size)), + data_dir, + "cached_{}_{}_{}_{}_{}".format( + filename, tokenizer_type, str(max_seq_length), str(vocab_size), str(num_samples) + ), ) master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 @@ -247,9 +251,9 @@ def __init__( labels_lines = dataset[1] features = get_features( - text_lines, - max_seq_length, - tokenizer, + queries=text_lines, + max_seq_length=max_seq_length, + tokenizer=tokenizer, pad_label=pad_label, raw_labels=labels_lines, label_ids=label_ids, @@ -271,8 +275,8 @@ def __init__( self.all_input_ids = features[0] self.all_segment_ids = features[1] self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] + self.all_subtokens_mask = features[3] + self.all_loss_mask = features[4] self.all_labels = features[5] def __len__(self): @@ -283,8 +287,8 @@ def __getitem__(self, idx): np.array(self.all_input_ids[idx]), np.array(self.all_segment_ids[idx]), np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), + np.array(self.all_loss_mask[idx]), np.array(self.all_labels[idx]), ) @@ -303,7 +307,6 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), } @@ -313,17 +316,16 @@ def __init__( """ Initializes BertTokenClassificationInferDataset Args: - queries: 
list of queries to run inference on + queries: text sequences max_seq_length: max sequence length minus 2 for [CLS] and [SEP] tokenizer: such as NemoBertTokenizer """ - features = get_features(queries, max_seq_length, tokenizer) + features = get_features(queries=queries, max_seq_length=max_seq_length, tokenizer=tokenizer) self.all_input_ids = features[0] self.all_segment_ids = features[1] self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] + self.all_subtokens_mask = features[3] def __len__(self): return len(self.all_input_ids) @@ -333,6 +335,5 @@ def __getitem__(self, idx): np.array(self.all_input_ids[idx]), np.array(self.all_segment_ids[idx]), np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), ) diff --git a/nemo/collections/nlp/data/token_classification/token_classification_descriptor.py b/nemo/collections/nlp/data/token_classification/token_classification_descriptor.py index 5633924c1a9c..6d277cf0c7d5 100644 --- a/nemo/collections/nlp/data/token_classification/token_classification_descriptor.py +++ b/nemo/collections/nlp/data/token_classification/token_classification_descriptor.py @@ -35,6 +35,7 @@ def __init__(self, data_dir: str, modes: List[str] = ['train', 'test', 'dev'], p It is going to look for the data files at {data_dir}/{mode}.txt """ self.data_dir = data_dir + self.label_ids = None unique_labels = set() for mode in modes: diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py index 2807e060eee2..422d0a35e990 100644 --- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py +++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py @@ -257,27 +257,18 @@ def _setup_dataloader_from_config(self, cfg: DictConfig): def _setup_infer_dataloader(self, queries: List[str], batch_size: int) -> 'torch.utils.data.DataLoader': """ - Setup function for a temporary data loader which wraps the provided audio file. + Setup function for a infer data loader. Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. + queries: lower cased text without punctuation + batch_size: batch size to use during inference Returns: A pytorch DataLoader. """ dataset = BertPunctuationCapitalizationInferDataset( - tokenizer=self.tokenizer, - queries=queries, - max_seq_length=self._cfg.dataset.max_seq_length, - ignore_extra_tokens=self._cfg.dataset.ignore_extra_tokens, - ignore_start_end=self._cfg.dataset.ignore_start_end, + tokenizer=self.tokenizer, queries=queries, max_seq_length=self._cfg.dataset.max_seq_length ) return torch.utils.data.DataLoader( @@ -295,7 +286,7 @@ def add_punctuation_capitalization(self, queries: List[str], batch_size: int = N Adds punctuation and capitalization to the queries. Use this method for debugging and prototyping. Args: queries: lower cased text without punctuation - batch_size: batch size to use during inference. 
+ batch_size: batch size to use during inference Returns: result: text with added capitalization and punctuation """ @@ -305,15 +296,16 @@ def add_punctuation_capitalization(self, queries: List[str], batch_size: int = N batch_size = len(queries) logging.info(f'Using batch size {batch_size} for inference') - # We will store transcriptions here + # We will store the output here result = [] # Model's mode and device mode = self.training - device = self._device + device = 'cuda' if torch.cuda.is_available() else 'cpu' try: # Switch model to evaluation mode self.eval() + self = self.to(device) infer_datalayer = self._setup_infer_dataloader(queries, batch_size) # store predictions for all queries in a single list diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py index 3ea3972abc62..8f58fcb69345 100644 --- a/nemo/collections/nlp/models/token_classification/token_classification_model.py +++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from typing import Dict, Optional +from typing import Dict, List, Optional, Union import torch from omegaconf import DictConfig, OmegaConf @@ -22,14 +22,19 @@ from nemo.collections.common.losses import CrossEntropyLoss from nemo.collections.common.tokenizers.tokenizer_utils import get_tokenizer -from nemo.collections.nlp.data.token_classification.token_classification_dataset import BertTokenClassificationDataset +from nemo.collections.nlp.data.token_classification.token_classification_dataset import ( + BertTokenClassificationDataset, + BertTokenClassificationInferDataset, +) from nemo.collections.nlp.data.token_classification.token_classification_descriptor import TokenClassificationDataDesc from nemo.collections.nlp.metrics.classification_report import ClassificationReport from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.common_utils import get_pretrained_lm_model -from nemo.core.classes import typecheck +from nemo.collections.nlp.parts.utils_funcs import get_classification_report, plot_confusion_matrix, tensor2list +from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.classes.modelPT import ModelPT from nemo.core.neural_types import NeuralType +from nemo.utils import logging __all__ = ['TokenClassificationModel'] @@ -48,13 +53,6 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes Token Classification Model.""" - self.data_dir = cfg.dataset.data_dir - - modes = ["train", "test", "dev"] - self.data_desc = TokenClassificationDataDesc( - data_dir=self.data_dir, modes=modes, pad_label=cfg.dataset.pad_label - ) - self.tokenizer = get_tokenizer( tokenizer_name=cfg.language_model.tokenizer, pretrained_model_name=cfg.language_model.pretrained_model_name, @@ -66,7 +64,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ), do_lower_case=cfg.language_model.do_lower_case, ) - # After this line self._cfg == cfg + + self._cfg = cfg + self.data_desc = None + self.update_data_dir(cfg.dataset.data_dir) + self.setup_loss(class_balancing=self._cfg.dataset.class_balancing) + super().__init__(cfg=cfg, trainer=trainer) self.bert_model = get_pretrained_lm_model( @@ -86,15 +89,41 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): 
use_transformer_init=self._cfg.head.use_transformer_init, ) - if self._cfg.dataset.class_balancing == 'weighted_loss' and self.data_desc: - # You may need to increase the number of epochs for convergence when using weighted_loss - self.loss = CrossEntropyLoss(logits_ndim=3, weight=self.data_desc.class_weights) - else: - self.loss = CrossEntropyLoss(logits_ndim=3) - + self.loss = self.setup_loss(class_balancing=self._cfg.dataset.class_balancing) # setup to track metrics self.classification_report = ClassificationReport(len(self._cfg.label_ids), label_ids=self._cfg.label_ids) + def update_data_dir(self, data_dir: str) -> None: + """ + Update data directory and get data stats with Data Descriptor + Weights are later used to setup loss + + Args: + data_dir: path to data directory + """ + modes = ["train", "test", "dev"] + self._cfg.dataset.data_dir = data_dir + logging.info(f'Setting model.dataset.data_dir to {data_dir}.') + + if os.path.exists(data_dir): + self.data_desc = TokenClassificationDataDesc( + data_dir=data_dir, modes=modes, pad_label=self._cfg.dataset.pad_label + ) + + def setup_loss(self, class_balancing: str = None): + """Setup loss + Call this method only after update_data_dir() so that self.data_desc has class weights stats + + Args: + class_balancing: whether to use class weights during training + """ + if class_balancing == 'weighted_loss' and self.data_desc: + # you may need to increase the number of epochs for convergence when using weighted_loss + loss = CrossEntropyLoss(logits_ndim=3, weight=self.data_desc.class_weights) + else: + loss = CrossEntropyLoss(logits_ndim=3) + return loss + @typecheck() def forward(self, input_ids, token_type_ids, attention_mask): hidden_states = self.bert_model( @@ -108,8 +137,7 @@ def training_step(self, batch, batch_idx): Lightning calls this inside the training loop with the data from the training dataloader passed in as `batch`. """ - # forward pass - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels = batch + input_ids, input_type_ids, input_mask, subtokens_mask, loss_mask, labels = batch logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask) @@ -121,7 +149,7 @@ def validation_step(self, batch, batch_idx): Lightning calls this inside the validation loop with the data from the validation dataloader passed in as `batch`. 
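As a side note on the `class_balancing='weighted_loss'` option handled by `setup_loss()` above: when classes are imbalanced (typical for NER, where most tokens are 'O'), per-class weights can be derived from label frequencies and passed to a cross-entropy loss. The sketch below is a generic stand-in that uses plain PyTorch `torch.nn.CrossEntropyLoss`, not NeMo's `CrossEntropyLoss(logits_ndim=3, ...)`, and the label counts are made up.

```python
# Hedged sketch: deriving class weights from (made-up) label counts and using
# them with a plain PyTorch cross-entropy loss. NeMo's data descriptor computes
# its own class_weights; this only illustrates the general idea.
import torch

label_counts = {"O": 9000, "B-PER": 400, "B-LOC": 600}  # hypothetical counts
total = sum(label_counts.values())
num_classes = len(label_counts)
weights = torch.tensor(
    [total / (num_classes * count) for count in label_counts.values()],
    dtype=torch.float,
)

criterion = torch.nn.CrossEntropyLoss(weight=weights)

logits = torch.randn(8, num_classes)           # [batch, num_classes]
targets = torch.randint(0, num_classes, (8,))  # [batch]
loss = criterion(logits, targets)
print(weights, loss.item())
```

As the comment in `setup_loss()` notes, a weighted loss may need more epochs to converge.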
""" - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels = batch + input_ids, input_type_ids, input_mask, subtokens_mask, loss_mask, labels = batch logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) val_loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask) @@ -155,7 +183,9 @@ def validation_epoch_end(self, outputs): } return {'val_loss': avg_loss, 'log': tensorboard_logs} - def setup_training_data(self, train_data_config: Optional[DictConfig]): + def setup_training_data(self, train_data_config: Optional[DictConfig] = None): + if train_data_config is None: + train_data_config = self._cfg.train_ds self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config) if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: @@ -163,21 +193,32 @@ def setup_training_data(self, train_data_config: Optional[DictConfig]): # save label maps to the config self._cfg.label_ids = OmegaConf.create(self.data_desc.label_ids) - def setup_validation_data(self, val_data_config: Optional[DictConfig]): + def setup_validation_data(self, val_data_config: Optional[DictConfig] = None): + if val_data_config is None: + val_data_config = self._cfg.validation_ds self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config) def setup_test_data(self, test_data_config: Optional[DictConfig]): + if test_data_config is None: + test_data_config = self._cfg.test_ds self._test_dl = self.__setup_dataloader_from_config(cfg=test_data_config) - def _setup_dataloader_from_config(self, cfg: DictConfig): - if not os.path.exists(self.data_dir): - raise FileNotFoundError( - f"Dataset not found at {self.data_dir}. For NER, CoNLL-2003 dataset can be obtained at " - "https://github.com/kyzhouhzau/BERT-NER/tree/master/data." 
- ) + def _setup_dataloader_from_config(self, cfg: DictConfig) -> DataLoader: + """ + Setup dataloader from config + Args: + cfg: config for the dataloader + Return: + Pytorch Dataloader + """ + dataset_cfg = self._cfg.dataset + data_dir = dataset_cfg.data_dir + + if not os.path.exists(data_dir): + raise FileNotFoundError(f"Data directory is not found at: {data_dir}.") - text_file = os.path.join(self.data_dir, cfg.text_file) - labels_file = os.path.join(self.data_dir, cfg.labels_file) + text_file = os.path.join(data_dir, cfg.text_file) + labels_file = os.path.join(data_dir, cfg.labels_file) if not (os.path.exists(text_file) and os.path.exists(labels_file)): raise FileNotFoundError( @@ -191,26 +232,211 @@ def _setup_dataloader_from_config(self, cfg: DictConfig): dataset = BertTokenClassificationDataset( text_file=text_file, label_file=labels_file, - max_seq_length=self._cfg.dataset.max_seq_length, + max_seq_length=dataset_cfg.max_seq_length, tokenizer=self.tokenizer, num_samples=cfg.num_samples, - pad_label=self.data_desc.pad_label, + pad_label=dataset_cfg.pad_label, label_ids=self.data_desc.label_ids, - ignore_extra_tokens=self._cfg.dataset.ignore_extra_tokens, - ignore_start_end=self._cfg.dataset.ignore_start_end, - use_cache=self._cfg.dataset.use_cache, + ignore_extra_tokens=dataset_cfg.ignore_extra_tokens, + ignore_start_end=dataset_cfg.ignore_start_end, + use_cache=dataset_cfg.use_cache, ) - return torch.utils.data.DataLoader( + return DataLoader( dataset=dataset, collate_fn=dataset.collate_fn, batch_size=cfg.batch_size, shuffle=cfg.shuffle, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - drop_last=cfg.drop_last, + num_workers=dataset_cfg.num_workers, + pin_memory=dataset_cfg.pin_memory, + drop_last=dataset_cfg.drop_last, + ) + + def _setup_infer_dataloader(self, queries: List[str], batch_size: int) -> 'torch.utils.data.DataLoader': + """ + Setup function for a infer data loader. + + Args: + queries: text + batch_size: batch size to use during inference + + Returns: + A pytorch DataLoader. + """ + + dataset = BertTokenClassificationInferDataset(tokenizer=self.tokenizer, queries=queries, max_seq_length=-1) + + return torch.utils.data.DataLoader( + dataset=dataset, + collate_fn=dataset.collate_fn, + batch_size=batch_size, + shuffle=False, + num_workers=self._cfg.dataset.num_workers, + pin_memory=self._cfg.dataset.pin_memory, + drop_last=False, ) + def _infer(self, queries: List[str], batch_size: int = None) -> List[int]: + """ + Get prediction for the queries + Args: + queries: text sequences + batch_size: batch size to use during inference. 
+ Returns: + all_preds: model predictions + """ + # store predictions for all queries in a single list + all_preds = [] + mode = self.training + try: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + # Switch model to evaluation mode + self.eval() + self.to(device) + infer_datalayer = self._setup_infer_dataloader(queries, batch_size) + + for i, batch in enumerate(infer_datalayer): + input_ids, input_type_ids, input_mask, subtokens_mask = batch + + logits = self.forward( + input_ids=input_ids.to(device), + token_type_ids=input_type_ids.to(device), + attention_mask=input_mask.to(device), + ) + + subtokens_mask = subtokens_mask > 0.5 + preds = tensor2list(torch.argmax(logits, axis=-1)[subtokens_mask]) + all_preds.extend(preds) + finally: + # set mode back to its original value + self.train(mode=mode) + return all_preds + + def add_predictions(self, queries: Union[List[str], str], batch_size: int = 32) -> List[str]: + """ + Add predicted token labels to the queries. Use this method for debugging and prototyping. + Args: + queries: text + batch_size: batch size to use during inference. + Returns: + result: text with added entities + """ + if queries is None or len(queries) == 0: + return [] + + result = [] + all_preds = self._infer(queries, batch_size) + + queries = [q.strip().split() for q in queries] + num_words = [len(q) for q in queries] + if sum(num_words) != len(all_preds): + raise ValueError('Pred and words must have the same length') + + ids_to_labels = {v: k for k, v in self._cfg.label_ids.items()} + start_idx = 0 + end_idx = 0 + for query in queries: + end_idx += len(query) + + # extract predictions for the current query from the list of all predictions + preds = all_preds[start_idx:end_idx] + start_idx = end_idx + + query_with_entities = '' + for j, word in enumerate(query): + # strip out the punctuation to attach the entity tag to the word not to a punctuation mark + # that follows the word + if word[-1].isalpha(): + punct = '' + else: + punct = word[-1] + word = word[:-1] + + query_with_entities += word + label = ids_to_labels[preds[j]] + + if label != self._cfg.dataset.pad_label: + query_with_entities += '[' + label + ']' + query_with_entities += punct + ' ' + result.append(query_with_entities.strip()) + return result + + def evaluate_from_file( + self, + output_dir: str, + text_file: str, + labels_file: Optional[str] = None, + add_confusion_matrix: Optional[bool] = False, + normalize_confusion_matrix: Optional[bool] = True, + batch_size: int = 32, + ) -> List[str]: + """ + Run inference on data from a file, plot confusion matrix and calculate classification report. + Use this method for final evaluation. + Args: + output_dir: path to output directory to store model predictions, confusion matrix plot (if set to True) + text_file: path to file with text. Each line of the text.txt file contains text sequences, where words + are separated with spaces: [WORD] [SPACE] [WORD] [SPACE] [WORD] + labels_file (Optional): path to file with labels. Each line of the labels_file should contain + labels corresponding to each word in the text_file, the labels are separated with spaces: + [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' + add_confusion_matrix: whether to generate confusion matrix + normalize_confusion_matrix: whether to normalize confusion matrix + batch_size: batch size to use during inference. 
+ Returns: + None. Model predictions and, optionally, the confusion matrix plot are saved to files in output_dir. + """ + output_dir = os.path.abspath(output_dir) + + with open(text_file, 'r') as f: + queries = f.readlines() + + all_preds = self._infer(queries, batch_size) + with_labels = labels_file is not None + if with_labels: + with open(labels_file, 'r') as f: + all_labels_str = f.readlines() + all_labels_str = ' '.join([labels.strip() for labels in all_labels_str]) + + # write labels and predictions to a file in the specified output_dir + os.makedirs(output_dir, exist_ok=True) + filename = os.path.join(output_dir, 'infer_' + os.path.basename(text_file)) + with open(filename, 'w') as f: + if with_labels: + f.write('labels\t' + all_labels_str + '\n') + logging.info(f'Labels saved to {filename}') + + # convert labels from string label to ids + ids_to_labels = {v: k for k, v in self._cfg.label_ids.items()} + all_preds_str = [ids_to_labels[pred] for pred in all_preds] + f.write('preds\t' + ' '.join(all_preds_str) + '\n') + logging.info(f'Predictions saved to {filename}') + + if with_labels and add_confusion_matrix: + all_labels = all_labels_str.split() + # convert labels from string label to ids + label_ids = self._cfg.label_ids + all_labels = [label_ids[label] for label in all_labels] + logging.info(f'Number of labels: {len(all_labels)}, number of predictions: {len(all_preds)}') + plot_confusion_matrix( + all_labels, all_preds, output_dir, label_ids=label_ids, normalize=normalize_confusion_matrix + ) + logging.info(get_classification_report(all_labels, all_preds, label_ids)) + @classmethod - def list_available_models(cls) -> Optional[Dict[str, str]]: - pass + def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: + """ + This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud. + + Returns: + List of available pre-trained models. + """ + result = [] + model = PretrainedModelInfo( + pretrained_model_name="NERModel", + location="https://nemo-public.s3.us-east-2.amazonaws.com/nemo-1.0.0alpha-tests/ner.nemo", + description="The model was trained on GMB (Groningen Meaning Bank) corpus for entity recognition and " + + "achieves 74.61 F1 Macro score.", + ) + result.append(model) + return result diff --git a/nemo/collections/nlp/parts/utils_funcs.py b/nemo/collections/nlp/parts/utils_funcs.py index db7f9b175a37..b002ce9962d9 100644 --- a/nemo/collections/nlp/parts/utils_funcs.py +++ b/nemo/collections/nlp/parts/utils_funcs.py @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['list2str', 'tensor2list'] +__all__ = ['list2str', 'tensor2list', 'plot_confusion_matrix', 'get_classification_report'] -from typing import List, Union +import os +import time +from typing import Dict, List, Union +import numpy as np +from matplotlib import pyplot as plt +from sklearn.metrics import classification_report, confusion_matrix from torch import Tensor +from nemo.utils import logging + def list2str(l: List[int]) -> str: """ Converts list to a string""" @@ -27,3 +34,96 @@ def tensor2list(tensor: Tensor) -> List[Union[int, float]]: """ Converts tensor to a list """ return tensor.detach().cpu().tolist() + + +def plot_confusion_matrix( + labels: List[int], + preds: List[int], + graph_fold: str, + label_ids: Dict[str, int] = None, + normalize: bool = False, + prefix: str = '', +): + ''' + Plot confusion matrix. 
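For orientation, here is a small standalone sketch of the quantities this helper and `get_classification_report` (defined further below) compute: a row-normalized confusion matrix and a per-class report with readable label names. It calls scikit-learn directly on toy labels and predictions; the label map is made up, and none of the NeMo plotting or file-saving logic is reproduced.

```python
# Standalone sketch (toy data): confusion matrix with row normalization and a
# classification report with readable label names, mirroring what the helpers
# in this module compute before plotting/logging.
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

label_ids = {"O": 0, "B-PER": 1, "B-LOC": 2}  # hypothetical label map
labels = [0, 0, 1, 2, 0, 1, 0, 2, 2, 0]       # ground truth ids
preds  = [0, 0, 1, 0, 0, 1, 1, 2, 2, 0]       # model predictions

cm = confusion_matrix(labels, preds)
sums = cm.sum(axis=1)[:, np.newaxis]
sums = np.where(sums == 0, 1, sums)            # avoid division by zero for empty rows
cm_normalized = cm.astype("float") / sums

target_names = [
    f"{name} (label id: {idx})"
    for name, idx in sorted(label_ids.items(), key=lambda kv: kv[1])
]
print(cm_normalized)
print(classification_report(labels, preds, target_names=target_names, digits=4))
```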
+ Args: + labels: ground truth labels + preds: model predictions + graph_fold: path to a folder to store confusion matrix plot + label_ids: str label to id map, for example: {'O': 0, 'LOC': 1} + normalize: whether to normalize confusion matrix + prefix: prefix for the plot name + ''' + if label_ids is None: + _plot_confusion_matrix(labels, preds, graph_fold) + + else: + # remove labels from label_ids that don't appear in the dev set + used_labels = set(labels) | set(preds) + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + + ids_to_labels = {label_ids[k]: k for k in label_ids} + classes = [ids_to_labels[id] for id in sorted(label_ids.values())] + + title = 'Confusion_matrix' + cm = confusion_matrix(labels, preds) + if normalize: + sums = cm.sum(axis=1)[:, np.newaxis] + sums = np.where(sums == 0, 1, sums) + cm = cm.astype('float') / sums + title = 'Normalized_' + title + + fig = plt.figure() + ax = fig.add_subplot(111) + + cax = ax.matshow(cm) + ax.set_xticks(np.arange(-1, len(classes) + 1)) + ax.set_yticks(np.arange(-1, len(classes) + 1)) + ax.set_xticklabels([''] + classes, rotation=90) + ax.set_yticklabels([''] + classes) + ax.set_ylabel('True') + ax.set_xlabel('Predicted') + + os.makedirs(graph_fold, exist_ok=True) + fig.colorbar(cax) + + title = (prefix + title).strip() + fig_name = os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S')) + plt.savefig(fig_name) + logging.info(f'Confusion matrix saved to {fig_name}') + + +def _plot_confusion_matrix(labels: List[int], preds: List[int], graph_fold: str): + """ + Plot confusion matrix + Args: + labels: ground truth labels + preds: model predictions + graph_fold: path to a folder to store confusion matrix plot + """ + cm = confusion_matrix(labels, preds) + logging.info(f'Confusion matrix:\n{cm}') + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(cm) + plt.title('Confusion matrix of the classifier') + fig.colorbar(cax) + plt.xlabel('Predicted') + plt.ylabel('True') + os.makedirs(graph_fold, exist_ok=True) + plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) + + +def get_classification_report(labels, preds, label_ids, output_dict=False): + """ + Returns classification report + """ + # remove labels from label_ids that don't appear in predictions or ground truths + used_labels = set(labels) | set(preds) + labels_names = [ + k + ' (label id: ' + str(v) + ')' + for k, v in sorted(label_ids.items(), key=lambda item: item[1]) + if v in used_labels + ] + + return classification_report(labels, preds, target_names=labels_names, digits=4, output_dict=output_dict) diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index 5d7ae458c7e0..aeeef6719c62 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -316,7 +316,7 @@ "- the other one - capitalization.\n", "\n", "The model is defined in a config file which declares multiple important sections. 
They are:\n", - "- **model**: All arguments that will relate to the Model - language model, token classifiers, optimizer and schedulers, datasets and any other related information\n", + "- **model**: All arguments that are related to the Model - language model, token classifiers, optimizer and schedulers, dataset and any other related information\n", "\n", "- **trainer**: Any argument to be passed to PyTorch Lightning" ] @@ -334,7 +334,7 @@ "os.makedirs(config_dir, exist_ok=True)\n", "if not os.path.exists(config_dir + MODEL_CONFIG):\n", " print('Downloading config file...')\n", - " wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/pc_tutorial/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml', config_dir)\n", + " wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/' + BRANCH + '/examples/nlp/token_classification/conf/' + MODEL_CONFIG, config_dir)\n", "else:\n", " print ('config file is already exists')" ], @@ -374,7 +374,7 @@ "\n", "`model.validation_ds.ds_item=[PATH_TO_DEV1,PATH_TO_DEV2]` (Note no space between the paths and square brackets).\n", "\n", - "Also notice that some configs, including `model.dataset.data_dir` have `???` in place of paths, this values are required to be specified by the user.\n", + "Also notice that some configs, including `model.dataset.data_dir`, have `???` in place of paths, this values are required to be specified by the user.\n", "\n", "Let's now add the data directory path to the config." ] @@ -494,7 +494,7 @@ "colab_type": "text" }, "source": [ - "Before initializing the model, we might want to modify some of the model configs." + "Before initializing the model, we might want to modify some of the model configs. For example, we might want to modify the pretrained BERT model." ] }, { @@ -505,11 +505,10 @@ "colab": {} }, "source": [ - "# specify BERT-like model, you want to use\n", - "PRETRAINED_BERT_MODEL = \"bert-base-uncased\"\n", - "\n", "# complete list of supported BERT-like models\n", - "nemo_nlp.modules.get_pretrained_lm_models_list()" + "nemo_nlp.modules.get_pretrained_lm_models_list()\n", + "\n", + "PRETRAINED_BERT_MODEL = \"bert-base-uncased\"" ], "execution_count": null, "outputs": [] @@ -528,26 +527,37 @@ "config.model.validation_ds.batch_size = BATCH_SIZE\n", "config.model.optim.lr = LEARNING_RATE\n", "config.model.train_ds.num_samples = NUM_SAMPLES\n", - "config.model.validation_ds.num_samples = NUM_SAMPLES" + "config.model.validation_ds.num_samples = NUM_SAMPLES\n" ], "execution_count": null, "outputs": [] }, { - "cell_type": "code", + "cell_type": "markdown", + "source": [ + "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders we'll be prepared for training and evaluation.\n", + "Also, the pretrained BERT model will be downloaded, note it can take up to a few minutes depending on the size of the chosen BERT model." 
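Optionally, before initializing the model, you can sanity-check that the overrides above actually landed in the config. This suggested extra cell (not part of the original notebook) assumes the `config` object created earlier in the notebook:

```python
# Optional sanity check: print the parts of the config we just modified.
from omegaconf import OmegaConf

print(OmegaConf.to_yaml(config.model.train_ds))
print(OmegaConf.to_yaml(config.model.validation_ds))
print(OmegaConf.to_yaml(config.model.optim))
```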
+ ], "metadata": { - "id": "NgsGLydWo-6-", - "colab_type": "code", - "colab": {} - }, + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "# initialize the model\n", - "# dataset we'll be prepared for training and evaluation during\n", + "# during this stage, the dataset and data loaders we'll be prepared for training and evaluation\n", "config.trainer.max_epochs = 3\n", "model = nemo_nlp.models.PunctuationCapitalizationModel(cfg=config.model, trainer=trainer)" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "markdown", diff --git a/tutorials/nlp/Token_Classification_Named_Entity_Recognition_tutorial.ipynb b/tutorials/nlp/Token_Classification_Named_Entity_Recognition_tutorial.ipynb new file mode 100644 index 000000000000..a1308e63b813 --- /dev/null +++ b/tutorials/nlp/Token_Classification_Named_Entity_Recognition_tutorial.ipynb @@ -0,0 +1,806 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Token_Classification_Named_Entity_Recognition_tutorial.ipynb", + "provenance": [], + "private_outputs": true, + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "uRLPr0TnIAHO", + "colab_type": "code", + "colab": {} + }, + "source": [ + "BRANCH = 'main'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "o_0K1lsW1dj9", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell\n", + "\n", + "# install NeMo\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[nlp]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dzqD2WDFOIN-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from nemo.collections import nlp as nemo_nlp\n", + "from nemo.utils.exp_manager import exp_manager\n", + "\n", + "import os\n", + "import wget \n", + "import torch\n", + "import pytorch_lightning as pl\n", + "from omegaconf import OmegaConf" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "daYw_Xll2ZR9", + "colab_type": "text" + }, + "source": [ + "# Task Description\n", + "**Named entity recognition (NER)**, also referred to as entity chunking, identification or extraction, is the task of detecting and classifying key information (entities) in text.\n", + "For example, in a sentence: `Mary lives in Santa Clara and works at NVIDIA`, we should detect that `Mary` is a person, `Santa Clara` is a location and `NVIDIA` is a company." 
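To connect this task description with the IOB labeling scheme used by the dataset below, here is a purely illustrative, hand-written tagging of that example sentence (these tags are not model output; the entity classes are among those listed in the next section):

```python
# Illustration only: hand-written IOB tags for the example sentence above.
sentence = "Mary lives in Santa Clara and works at NVIDIA".split()
tags     = ["B-PER", "O", "O", "B-LOC", "I-LOC", "O", "O", "O", "B-ORG"]

for word, tag in zip(sentence, tags):
    print(f"{word:10s} {tag}")
```

Here `B-` marks the first word of an entity span and `I-` a continuation, as described in the dataset section below.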
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZnuziSwJ1yEB", + "colab_type": "text" + }, + "source": [ + "# Dataset\n", + "\n", + "In this tutorial we going to use [GMB(Groningen Meaning Bank)](http://www.let.rug.nl/bjerva/gmb/about.php) corpus for entity recognition. \n", + "\n", + "GMB is a fairly large corpus with a lot of annotations. Note, that GMB is not completely human annotated and it’s not considered 100% correct. \n", + "The data is labeled using the [IOB format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) (short for inside, outside, beginning). \n", + "\n", + "The following classes appear in the dataset:\n", + "* LOC = Geographical Entity\n", + "* ORG = Organization\n", + "* PER = Person\n", + "* GPE = Geopolitical Entity\n", + "* TIME = Time indicator\n", + "* ART = Artifact\n", + "* EVE = Event\n", + "* NAT = Natural Phenomenon\n", + "\n", + "For this tutorial, classes ART, EVE, and NAT were combined into a MISC class due to small number of examples for these classes.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qzcZ3nb_-SVT", + "colab_type": "text" + }, + "source": [ + "# NeMo Token Classification Data Format\n", + "\n", + "[TokenClassification Model](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/token_classification/token_classification_model.py) in NeMo supports NER and other token level classification tasks, as long as the data follows the format specified below. \n", + "\n", + "Token Classification Model requires the data to be splitted into 2 files: \n", + "* text.txt and \n", + "* labels.txt. \n", + "\n", + "Each line of the **text.txt** file contains text sequences, where words are separated with spaces, i.e.: \n", + "[WORD] [SPACE] [WORD] [SPACE] [WORD].\n", + "\n", + "The **labels.txt** file contains corresponding labels for each word in text.txt, the labels are separated with spaces, i.e.:\n", + "[LABEL] [SPACE] [LABEL] [SPACE] [LABEL].\n", + "\n", + "Example of a text.txt file:\n", + "```\n", + "Jennifer is from New York City .\n", + "She likes ...\n", + "...\n", + "```\n", + "Corresponding labels.txt file:\n", + "```\n", + "B-PER O O B-LOC I-LOC I-LOC O\n", + "O O ...\n", + "...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VsEmwIPO4L4V", + "colab_type": "text" + }, + "source": [ + "To convert an IOB format data to the format required for training, run `examples/nlp/token_classification/data/import_from_iob_format.py` on your train and dev files, as follows:\n", + "\n", + "\n", + "\n", + "\n", + "```\n", + "python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE\n", + "```\n", + "\n", + "For this tutorial, we are going to use the preprocessed GMB dataset.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SL58EWkd2ZVb", + "colab_type": "text" + }, + "source": [ + "## Download and preprocess the data¶" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "n8HZrDmr12_-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "DATA_DIR = \"DATA_DIR\"\n", + "WORK_DIR = \"WORK_DIR\"\n", + "MODEL_CONFIG = \"token_classification_config.yaml\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jrx2ZXHrCHb_", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# download preprocessed data\n", + "os.makedirs(WORK_DIR, exist_ok=True)\n", + 
"os.makedirs(DATA_DIR, exist_ok=True)\n", + "print('Downloading GMB data...')\n", + "wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/gmb_v_2.2.0_clean.zip', DATA_DIR)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NhUzIeF0Yg0l", + "colab_type": "text" + }, + "source": [ + "Let's extract files from the .zip file:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y01BdjPRW-7B", + "colab_type": "code", + "colab": {} + }, + "source": [ + "! unzip {DATA_DIR}/gmb_v_2.2.0_clean.zip -d {DATA_DIR}\n", + "DATA_DIR = os.path.join(DATA_DIR, 'gmb_v_2.2.0_clean')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U8Ty5_S7Ye8h", + "colab_type": "text" + }, + "source": [ + "Now, the data folder should contain 4 files:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L8vsyh3JZH26", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "* labels_dev.txt\n", + "* labels_train.txt\n", + "* text_dev.txt\n", + "* text_train.txt\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qB0oLE4R9EhJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "! ls -l {DATA_DIR}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6UDPgadLN6SG", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# let's take a look at the data \n", + "print('Text:')\n", + "! head -n 5 {DATA_DIR}/text_train.txt\n", + "\n", + "print('\\nLabels:')\n", + "! head -n 5 {DATA_DIR}/labels_train.txt" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "daludzzL2Jba", + "colab_type": "text" + }, + "source": [ + "# Model Configuration" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tit5kG4Z5SXu", + "colab_type": "text" + }, + "source": [ + "# Using an Out-of-the-Box Model\n", + "\n", + "To use a pretrained NER model, run:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BKe5Jn4u9xng", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# this line will download pre-trained NER model from NVIDIA's NGC cloud and instantiate it for you\n", + "pretrained_ner_model = nemo_nlp.models.TokenClassificationModel.from_pretrained(model_name=\"NERModel\") " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "y8SFxPJd-hkH" + }, + "source": [ + "To see how the model performs, let’s get model's predictions for a few examples:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DQhsamclRtxJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# define the list of queries for inference\n", + "queries = [\n", + " 'we bought four shirts from the nvidia gear store in santa clara.',\n", + " 'Nvidia is a company.',\n", + " 'The Adventures of Tom Sawyer by Mark Twain is an 1876 novel about a young boy growing '\n", + " + 'up along the Mississippi River.',\n", + "]\n", + "results = pretrained_ner_model.add_predictions(queries)\n", + "\n", + "for query, result in zip(queries, results):\n", + " print()\n", + " print(f'Query : {query}')\n", + " print(f'Result: {result.strip()}\\n')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_whKCxfTMo6Y", + "colab_type": "text" + }, + "source": [ + "Now, let's take a closer look at the model's configuration and learn to train the model from 
scratch and finetune the pretrained model.\n", + "\n", + "# Model configuration\n", + "\n", + "Our Named Entity Recognition model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Token Classification layer.\n", + "\n", + "The model is defined in a config file which declares multiple important sections. They are:\n", + "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", + "\n", + "- **trainer**: Any argument to be passed to PyTorch Lightning" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "T1gA8PsJ13MJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# download the model's configuration file \n", + "config_dir = WORK_DIR + '/configs/'\n", + "os.makedirs(config_dir, exist_ok=True)\n", + "if not os.path.exists(config_dir + MODEL_CONFIG):\n", + " print('Downloading config file...')\n", + " wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/' + BRANCH + '/examples/nlp/token_classification/conf/' + MODEL_CONFIG, config_dir)\n", + "else:\n", + " print('config file already exists')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "mX3KmWMvSUQw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# this line will print the entire config of the model\n", + "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", + "print(config_path)\n", + "config = OmegaConf.load(config_path)\n", + "print(OmegaConf.to_yaml(config))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZCgWzNBkaQLZ", + "colab_type": "text" + }, + "source": [ + "# Model Training From Scratch\n", + "## Setting up Data within the config\n", + "\n", + "Among other things, the config file contains dictionaries called dataset, train_ds and validation_ds. These are configurations used to set up the Dataset and DataLoaders of the corresponding config.\n", + "\n", + "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", + "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", + "\n", + "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths; this means that values for these fields are required to be specified by the user.\n", + "\n", + "Let's now add the data directory path to the config."
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LQHCJN-ZaoLp", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# in this tutorial train and dev datasets are located in the same folder, so it is enought to add the path of the data directory to the config\n", + "config.model.dataset.data_dir = DATA_DIR\n", + "\n", + "# if you want to decrease the size of your datasets, uncomment the lines below:\n", + "# NUM_SAMPLES = 1000\n", + "# config.model.train_ds.num_samples = NUM_SAMPLES\n", + "# config.model.validation_ds.num_samples = NUM_SAMPLES" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nB96-3sTc3yk", + "colab_type": "text" + }, + "source": [ + "## Building the PyTorch Lightning Trainer\n", + "\n", + "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", + "\n", + "Let's first instantiate a Trainer object" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1tG4FzZ4Ui60", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(\"Trainer config - \\n\")\n", + "print(OmegaConf.to_yaml(config.trainer))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "knF6QeQQdMrH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# lets modify some trainer configs\n", + "# checks if we have GPU available and uses it\n", + "cuda = 1 if torch.cuda.is_available() else 0\n", + "config.trainer.gpus = cuda\n", + "\n", + "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", + "\n", + "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", + "# config.trainer.amp_level = O1\n", + "\n", + "# remove distributed training flags\n", + "config.trainer.distributed_backend = None\n", + "\n", + "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", + "config.trainer.max_steps = 5\n", + "\n", + "trainer = pl.Trainer(**config.trainer)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8IlEMdVxdr6p", + "colab_type": "text" + }, + "source": [ + "## Setting up a NeMo Experiment¶\n", + "\n", + "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8uztqGAmdrYt", + "colab_type": "code", + "colab": {} + }, + "source": [ + "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", + "\n", + "# the exp_dir provides a path to the current experiment for easy access\n", + "exp_dir = str(exp_dir)\n", + "exp_dir" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8tjLhUvL_o7_", + "colab_type": "text" + }, + "source": [ + "Before initializing the model, we might want to modify some of the model configs. 
For example, we might want to modify the pretrained BERT model:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Xeuc2i7Y_nP5", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# complete list of supported BERT-like models\n", + "print(nemo_nlp.modules.get_pretrained_lm_models_list())\n", + "\n", + "# specify BERT-like model, you want to use\n", + "PRETRAINED_BERT_MODEL = \"bert-base-uncased\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RK2xglXyAUOO", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# add the specified above model parameters to the config\n", + "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fzNZNAVRjDD-", + "colab_type": "text" + }, + "source": [ + "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders we'll be prepared for training and evaluation.\n", + "Also, the pretrained BERT model will be downloaded, note it can take up to a few minutes depending on the size of the chosen BERT model." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NgsGLydWo-6-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "model_from_scratch = nemo_nlp.models.TokenClassificationModel(cfg=config.model, trainer=trainer)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kQ592Tx4pzyB", + "colab_type": "text" + }, + "source": [ + "## Monitoring training progress\n", + "Optionally, you can create a Tensorboard visualization to monitor training progress." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mTJr16_pp0aS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load the TensorBoard notebook extension\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir {exp_dir}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "hUvnSpyjp0Dh", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# start model training\n", + "trainer.fit(model_from_scratch)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JxBiIKMlH8yv", + "colab_type": "text" + }, + "source": [ + "After training for 5 epochs, with the default config and NUM_SAMPLES = -1 (i.e.all data is used), your model performance should look similar to this: \n", + "```\n", + " label precision recall f1 support \n", + " O (label_id: 0) 99.14 99.19 99.17 131141\n", + " B-GPE (label_id: 1) 95.86 94.03 94.93 2362\n", + " B-LOC (label_id: 2) 83.99 90.31 87.04 5346\n", + " B-MISC (label_id: 3) 39.82 34.62 37.04 130\n", + " B-ORG (label_id: 4) 78.33 67.82 72.70 2980\n", + " B-PER (label_id: 5) 84.36 84.32 84.34 2577\n", + " B-TIME (label_id: 6) 91.94 91.23 91.58 2975\n", + " I-GPE (label_id: 7) 88.89 34.78 50.00 23\n", + " I-LOC (label_id: 8) 77.18 79.13 78.14 1030\n", + " I-MISC (label_id: 9) 28.57 24.00 26.09 75\n", + " I-ORG (label_id: 10) 78.67 75.67 77.14 2384\n", + " I-PER (label_id: 11) 86.69 90.17 88.40 2687\n", + " I-TIME (label_id: 12) 83.21 83.48 83.34 938\n", + " -------------------\n", + " micro avg 96.95 96.95 96.95 154648\n", + " macro avg 78.20 72.98 74.61 154648\n", + " weighted avg 96.92 96.95 96.92 154648\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VPdzJVAgSFaJ", + "colab_type": "text" + 
}, + "source": [ + "# Inference\n", + "\n", + "To see how the model performs, we can run generate prediction similar to the way we did it earlier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QaW0A1OOwefR", + "colab_type": "text" + }, + "source": [ + "## Generate Predictions\n", + "\n", + "To see how the model performs, we can generate prediction the same way we did it earlier or we can use our model to generate predictions for a dataset from a file, for example, to perform final evaluation or to do error analysis.\n", + "Below, we are using a subset of dev set, but it could be any text file as long as it follows the data format described above.\n", + "Labels_file is optional here, and if provided will be used to get metrics and plot confusion matrix." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "92PB0iTqNnW-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# let's first create a subset of our dev data\n", + "! head -n 100 {DATA_DIR}/text_dev.txt > {DATA_DIR}/sample_text_dev.txt\n", + "! head -n 100 {DATA_DIR}/labels_dev.txt > {DATA_DIR}/sample_labels_dev.txt" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vXnx2tKoOohy", + "colab_type": "text" + }, + "source": [ + "Now, let's generate predictions for the provided text file.\n", + "If labels file is also specified, the model will evaluate the predictions and plot confusion matrix. " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "sglcZV1bwsv0", + "colab_type": "code", + "colab": {} + }, + "source": [ + "model_from_scratch.evaluate_from_file(\n", + " text_file=os.path.join(DATA_DIR, 'sample_text_dev.txt'),\n", + " labels_file=os.path.join(DATA_DIR, 'sample_labels_dev.txt'),\n", + " output_dir=exp_dir,\n", + " add_confusion_matrix=True,\n", + " normalize_confusion_matrix=True,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ref1qSonGNhP", + "colab_type": "text" + }, + "source": [ + "## Training Script\n", + "\n", + "If you have NeMo installed locally, you can also train the model with `nlp/token_classification/token_classification.py.`\n", + "\n", + "To run training script, use:\n", + "\n", + "`python token_classification.py model.dataset.data_dir=PATH_TO_DATA_DIR`\n", + "\n", + "# Finetuning model with your data\n", + "\n", + "When we were training from scratch, the datasets were prepared for training during the model initialization. 
When we are using a pretrained NER model, before training, we need to setup training and evaluation data.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yu9fZc2vPQfw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# let's reload our pretrained NER model\n", + "pretrained_ner_model = nemo_nlp.models.TokenClassificationModel.from_pretrained('NERModel')\n", + "\n", + "# then we need to setup the data dir to get class weights statistics\n", + "pretrained_ner_model.update_data_dir(DATA_DIR)\n", + "\n", + "# then we're setting up loss, use class_balancing='weighted_loss' if you want to add class weights to the CrossEntropyLoss\n", + "pretrained_ner_model.setup_loss(class_balancing='weighted_loss')\n", + "\n", + "# setup train and validation Pytorch DataLoaders\n", + "pretrained_ner_model.setup_training_data()\n", + "pretrained_ner_model.setup_validation_data()\n", + "\n", + "# and now we can create a PyTorch Lightning trainer and call `fit` again\n", + "# for this tutorial we are setting fast_dev_run to True, and the trainer will run 1 training batch and 1 validation batch\n", + "# for actual model training, disable the flag\n", + "fast_dev_run = True\n", + "trainer = pl.Trainer(gpus=[1], fast_dev_run=fast_dev_run)\n", + "trainer.fit(pretrained_ner_model)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 486149c3b34a08b7ea7a9fb641b11e9fb65fd6ab Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 31 Aug 2020 10:14:00 -0700 Subject: [PATCH 07/12] Support overriding the config path when restoring model (#1088) * Add restore from override option and test Signed-off-by: smajumdar * Correct strict signature Signed-off-by: smajumdar * Add support for extracting state dict from nemo files Signed-off-by: smajumdar * correct signature of abstract FileIO method Signed-off-by: smajumdar Co-authored-by: Oleksii Kuchaiev --- nemo/core/classes/common.py | 2 +- nemo/core/classes/modelPT.py | 104 +++++++++++++++++++++++++++++++-- tests/core/test_fileio.py | 109 ++++++++++++++++++++++++++++++++++- 3 files changed, 208 insertions(+), 7 deletions(-) diff --git a/nemo/core/classes/common.py b/nemo/core/classes/common.py index 85964cd2dfe4..f343d75c7407 100644 --- a/nemo/core/classes/common.py +++ b/nemo/core/classes/common.py @@ -281,7 +281,7 @@ def save_to(self, save_path: str): @classmethod @abstractmethod - def restore_from(cls, restore_path: str): + def restore_from(cls, restore_path: str, override_config_path: Optional[str] = None): """Restores module/model with weights""" pass diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 238921637339..489fdee0020b 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -179,11 +179,13 @@ def save_to(self, save_path: str): self.__make_nemo_file_from_folder(filename=save_path, source_dir=tmpdir) @classmethod - def restore_from(cls, restore_path: str): + def restore_from(cls, restore_path: str, override_config_path: Optional[str] = None): """ Restores model instance (weights and configuration) into .nemo file Args: restore_path: path to .nemo file from which model should be instantiated + override_config_path: path to a yaml config that will override the internal + config file Example: ``` @@ -204,9 +206,19 @@ def restore_from(cls, restore_path: str): cls.__set_model_restore_state(is_being_restored=True) cls.__unpack_nemo_file(path2file=restore_path, out_folder=tmpdir) os.chdir(tmpdir) - config_yaml = path.join(tmpdir, _MODEL_CONFIG_YAML) - 
model_weights = path.join(tmpdir, _MODEL_WEIGHTS) + if override_config_path is None: + config_yaml = path.join(tmpdir, _MODEL_CONFIG_YAML) + else: + config_yaml = override_config_path conf = OmegaConf.load(config_yaml) + if override_config_path is not None: + # Resolve the override config + conf = OmegaConf.to_container(conf, resolve=True) + conf = OmegaConf.create(conf) + # If override is top level config, extract just `model` from it + if 'model' in conf: + conf = conf.model + model_weights = path.join(tmpdir, _MODEL_WEIGHTS) OmegaConf.set_struct(conf, True) instance = cls.from_config_dict(config=conf) instance.load_state_dict(torch.load(model_weights)) @@ -218,6 +230,83 @@ def restore_from(cls, restore_path: str): return instance + @classmethod + def extract_state_dict_from(cls, restore_path: str, save_dir: str, split_by_module: bool = False): + """ + Extract the state dict(s) from a provided .nemo tarfile and save it to a directory. + Args: + restore_path: path to .nemo file from which state dict(s) should be extracted + save_dir: directory in which the saved state dict(s) should be stored + split_by_module: bool flag, which determines whether the output checkpoint should + be for the entire Model, or the individual modules that comprise the Model + + Example: + To convert the .nemo tarfile into a single Model level PyTorch checkpoint + ``` + state_dict = nemo.collections.asr.models.EncDecCTCModel.extract_state_dict_from('asr.nemo', './asr_ckpts') + ``` + + To restore a model from a Model level checkpoint + ``` + model = nemo.collections.asr.models.EncDecCTCModel(cfg) # or any other method of restoration + model.load_state_dict(torch.load("./asr_ckpts/model_weights.ckpt")) + ``` + + To convert the .nemo tarfile into multiple Module level PyTorch checkpoints + ``` + state_dict = nemo.collections.asr.models.EncDecCTCModel.extract_state_dict_from('asr.nemo', './asr_ckpts', + split_by_module=True) + ``` + + To restore a module from a Module level checkpoint + ``` + model = nemo.collections.asr.models.EncDecCTCModel(cfg) # or any other method of restoration + + # load the individual components + model.preprocessor.load_state_dict(torch.load("./asr_ckpts/preprocessor.ckpt")) + model.encoder.load_state_dict(torch.load("./asr_ckpts/encoder.ckpt")) + model.decoder.load_state_dict(torch.load("./asr_ckpts/decoder.ckpt")) + ``` + + Returns: + The state dict that was loaded from the original .nemo checkpoint + """ + if not path.exists(restore_path): + raise FileExistsError(f"Can't find {restore_path}") + + cwd = os.getcwd() + + save_dir = os.path.abspath(save_dir) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + + with tempfile.TemporaryDirectory() as tmpdir: + try: + cls.__unpack_nemo_file(path2file=restore_path, out_folder=tmpdir) + os.chdir(tmpdir) + model_weights = path.join(tmpdir, _MODEL_WEIGHTS) + state_dict = torch.load(model_weights) + + if not split_by_module: + filepath = os.path.join(save_dir, _MODEL_WEIGHTS) + torch.save(state_dict, filepath) + + else: + key_set = set([key.split(".")[0] for key in state_dict.keys()]) + for primary_key in key_set: + inner_keys = [key for key in state_dict.keys() if key.split(".")[0] == primary_key] + state_dict_subset = { + ".".join(inner_key.split(".")[1:]): state_dict[inner_key] for inner_key in inner_keys + } + filepath = os.path.join(save_dir, f"{primary_key}.ckpt") + torch.save(state_dict_subset, filepath) + + logging.info(f'Checkpoints from {restore_path} were successfully extracted into {save_dir}.') + finally: + 
os.chdir(cwd) + + return state_dict + @classmethod def load_from_checkpoint( cls, @@ -225,19 +314,24 @@ def load_from_checkpoint( *args, map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None, hparams_file: Optional[str] = None, + strict: bool = True, **kwargs, ): """ Loads ModelPT from checkpoint, with some maintenance of restoration. For documentation, please refer to LightningModule.load_from_checkpoin() documentation. """ - # TODO (@titu1994): When PTL 0.9+ is supported, add `strict=False` flag to constructor checkpoint = None try: cls.__set_model_restore_state(is_being_restored=True) checkpoint = super().load_from_checkpoint( - checkpoint_path=checkpoint_path, *args, map_location=map_location, hparams_file=hparams_file, **kwargs + checkpoint_path=checkpoint_path, + *args, + map_location=map_location, + hparams_file=hparams_file, + strict=strict, + **kwargs, ) finally: diff --git a/tests/core/test_fileio.py b/tests/core/test_fileio.py index d116ffaaee15..8f28e603f1ea 100644 --- a/tests/core/test_fileio.py +++ b/tests/core/test_fileio.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import tempfile import numpy as np import pytest -from omegaconf import DictConfig +import torch +from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.models import EncDecCTCModel @@ -136,3 +138,108 @@ def test_save_restore_from_nemo_file(self, asr_model): w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() assert np.array_equal(w1, w2) + + @pytest.mark.unit + def test_save_restore_from_nemo_file_with_override(self, asr_model): + """" Test makes sure that the second instance created from the same configuration AND checkpoint + has the same weights. """ + + with tempfile.NamedTemporaryFile() as fp, tempfile.NamedTemporaryFile(mode='a+') as conf_fp: + filename = fp.name + + # Save model (with random artifact). + with tempfile.NamedTemporaryFile() as artifact: + asr_model.register_artifact(config_path=None, src=artifact.name) + asr_model.save_to(save_path=filename) + + # Modify config slightly + cfg = asr_model.cfg + cfg.encoder.params.activation = 'swish' + yaml_cfg = OmegaConf.to_yaml(cfg) + conf_fp.write(yaml_cfg) + conf_fp.seek(0) + + # Restore the model. + asr_model2 = EncDecCTCModel.restore_from(restore_path=filename, override_config_path=conf_fp.name) + + assert len(asr_model.decoder.vocabulary) == len(asr_model2.decoder.vocabulary) + assert asr_model.num_weights == asr_model2.num_weights + + w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + + assert np.array_equal(w1, w2) + + assert asr_model2.cfg.encoder.params.activation == 'swish' + + @pytest.mark.unit + def test_save_model_level_pt_ckpt(self, asr_model): + with tempfile.TemporaryDirectory() as ckpt_dir: + nemo_file = os.path.join(ckpt_dir, 'asr.nemo') + asr_model.save_to(nemo_file) + + # Save model level PT checkpoint + asr_model.extract_state_dict_from(nemo_file, ckpt_dir) + ckpt_path = os.path.join(ckpt_dir, 'model_weights.ckpt') + + assert os.path.exists(ckpt_path) + + # Restore the model. 
+ asr_model2 = EncDecCTCModel.restore_from(restore_path=nemo_file) + + assert len(asr_model.decoder.vocabulary) == len(asr_model2.decoder.vocabulary) + assert asr_model.num_weights == asr_model2.num_weights + + # Change weights values + asr_model2.encoder.encoder[0].mconv[0].conv.weight.data += 1.0 + + w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + + assert not np.array_equal(w1, w2) + + # Restore from checkpoint + asr_model2.load_state_dict(torch.load(ckpt_path)) + + w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + + assert np.array_equal(w1, w2) + + @pytest.mark.unit + def test_save_module_level_pt_ckpt(self, asr_model): + with tempfile.TemporaryDirectory() as ckpt_dir: + nemo_file = os.path.join(ckpt_dir, 'asr.nemo') + asr_model.save_to(nemo_file) + + # Save model level PT checkpoint + asr_model.extract_state_dict_from(nemo_file, ckpt_dir, split_by_module=True) + encoder_path = os.path.join(ckpt_dir, 'encoder.ckpt') + decoder_path = os.path.join(ckpt_dir, 'decoder.ckpt') + preprocessor_path = os.path.join(ckpt_dir, 'preprocessor.ckpt') + + assert os.path.exists(encoder_path) + assert os.path.exists(decoder_path) + assert os.path.exists(preprocessor_path) + + # Restore the model. + asr_model2 = EncDecCTCModel.restore_from(restore_path=nemo_file) + + assert len(asr_model.decoder.vocabulary) == len(asr_model2.decoder.vocabulary) + assert asr_model.num_weights == asr_model2.num_weights + + # Change weights values + asr_model2.encoder.encoder[0].mconv[0].conv.weight.data += 1.0 + + w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + + assert not np.array_equal(w1, w2) + + # Restore from checkpoint + asr_model2.encoder.load_state_dict(torch.load(encoder_path)) + + w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy() + + assert np.array_equal(w1, w2) From 3fc0e56fe931202b5cca428e796bd2849b7e7d13 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Mon, 31 Aug 2020 11:16:39 -0700 Subject: [PATCH 08/12] readme update, links changed to main (#1093) * readme update, updated links with main branch Signed-off-by: ekmb * jenkins Signed-off-by: ekmb --- Jenkinsfile | 3 +- README.rst | 2 ++ docs/source/nlp/models.rst | 33 +++++++------------ .../token_classification_model.py | 1 + .../nlp/Punctuation_and_Capitalization.ipynb | 6 ++-- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 87eb554663be..525afc3e2bb8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -453,7 +453,8 @@ pipeline { model.dataset.data_dir=/home/TestData/nlp/glue_fake/MRPC \ trainer.gpus=[0] \ +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/nlp/glue_benchmark/mrpc' + exp_manager.exp_dir=examples/nlp/glue_benchmark/mrpc \ + model.output_dir=examples/nlp/glue_benchmark/mrpc' sh 'rm -rf examples/nlp/glue_benchmark/mrpc' } } diff --git a/README.rst b/README.rst index d0e361f498f0..9ab52fd967c4 100644 --- a/README.rst +++ b/README.rst @@ -99,6 +99,8 @@ To run tutorials: - Online Noise Augmentation - `05_Online_Noise_Augmentation.ipynb `_ * - NLP + - Token Classification (Named 
Entity Recognition) + - `Token_Classification_Named_Entity_Recognition_tutorial.ipynb `_ - Punctuation and Capitialization - `Punctuation_and_Capitalization.ipynb `_ diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index 2f655c76c650..6fabb62e88da 100644 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -1,24 +1,15 @@ Models ====== -Currently, NeMo's NLP collection supports the following models: - -Supported Tasks and Models: - -BERT pretraining ----------------- -GLUE Benchmark -------------- -Intent Detection and Slot Filling --------------------------------- -Text Classification ------------------- -Name Entity Recognition (NER) ----------------------------- -Punctuation and Capitalization ------------------------------ -Question Answering ------------------ - -Scripts for running these models, can be found under ``/example/nlp/``. -NLP tutorial are located under ``/tutorials/nlp/``. \ No newline at end of file +NeMo's NLP collection supports the following models: + +* BERT pretraining +* GLUE Benchmark +* Intent Detection and Slot Filling +* Text Classification +* Named Entity Recognition (NER) +* Punctuation and Capitalization +* Question Answering + +Scripts for running these models can be found under ``NeMo/examples/nlp/``. +NLP tutorials are located under ``NeMo/tutorials/nlp/``. \ No newline at end of file diff --git a/nemo/collections/nlp/models/token_classification/token_classification_model.py b/nemo/collections/nlp/models/token_classification/token_classification_model.py index 8f58fcb69345..d182085f5e7a 100644 --- a/nemo/collections/nlp/models/token_classification/token_classification_model.py +++ b/nemo/collections/nlp/models/token_classification/token_classification_model.py @@ -373,6 +373,7 @@ def evaluate_from_file( """ Run inference on data from a file, plot confusion matrix and calculate classification report. Use this method for final evaluation. + Args: output_dir: path to output directory to store model predictions, confusion matrix plot (if set to True) text_file: path to file with text. Each line of the text.txt file contains text sequences, where words diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index aeeef6719c62..bab2879c61a2 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -34,7 +34,7 @@ "colab": {} }, "source": [ - "BRANCH = 'candidate'" + "BRANCH = 'main'" ], "execution_count": null, "outputs": [] @@ -191,7 +191,7 @@ "colab_type": "text" }, "source": [ - "In this notebook we are going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng) this script will download and preprocess the Tatoeba data [NeMo/examples/nlp/token_classification/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/examples/nlp/token_classification/get_tatoeba_data.py). Note, for further experiments with the model, set NUM_SAMPLES=-1 and consider including other datasets to improve model performance. \n" + "In this notebook we are going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng). This script will download and preprocess the Tatoeba data: `NeMo/examples/nlp/token_classification/data/get_tatoeba_data.py`. Note: for further experiments with the model, set NUM_SAMPLES=-1 and consider including other datasets to improve model performance. 
\n" ] }, { @@ -227,7 +227,7 @@ "os.makedirs(WORK_DIR, exist_ok=True)\n", "if not os.path.exists(WORK_DIR + '/get_tatoeba_data.py'):\n", " print('Downloading get_tatoeba_data.py...')\n", - " wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/nlp/token_classification/data/get_tatoeba_data.py', WORK_DIR)\n", + " wget.download('https://raw.githubusercontent.com/NVIDIA/NeMo/' + BRANCH + '/examples/nlp/token_classification/data/get_tatoeba_data.py', WORK_DIR)\n", "else:\n", " print ('get_tatoeba_data.py is already exists')" ], From 774c36af71e12ab044f7c59c41276a65a16397c8 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 31 Aug 2020 15:28:57 -0400 Subject: [PATCH 09/12] Update AudioToMel (#1092) * update asr module test Signed-off-by: Jason * increase tolerance Signed-off-by: Jason Co-authored-by: Oleksii Kuchaiev --- nemo/collections/asr/parts/features.py | 21 ++++++++++++++------- tests/collections/asr/test_asr_modules.py | 20 +++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/nemo/collections/asr/parts/features.py b/nemo/collections/asr/parts/features.py index a01af40789db..2877b98bd689 100644 --- a/nemo/collections/asr/parts/features.py +++ b/nemo/collections/asr/parts/features.py @@ -51,6 +51,11 @@ def normalize_batch(x, seq_len, normalize_type): x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) for i in range(x.shape[0]): + if x[i, :, : seq_len[i]].shape[1] == 1: + raise ValueError( + "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " + "in torch.std() returning nan" + ) x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) # make sure x_std is not zero @@ -225,8 +230,7 @@ def __init__( highfreq = highfreq or sample_rate / 2 filterbanks = torch.tensor( - librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq,), - dtype=torch.float, + librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float, ).unsqueeze(0) # self.fb = filterbanks # self.window = window_tensor @@ -296,15 +300,18 @@ def forward(self, x, seq_len): # do preemphasis if self.preemph is not None: - x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1,) + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1) - x = self.stft(x) + with torch.cuda.amp.autocast(enabled=False): + x = self.stft(x) + + # torch returns real, imag; so convert to magnitude + if not self.stft_conv: + x = torch.sqrt(x.pow(2).sum(-1)) # get power spectrum if self.mag_power != 1.0: x = x.pow(self.mag_power) - if not self.stft_conv: - x = x.sum(-1) # dot with filterbank energies x = torch.matmul(self.fb.to(x.dtype), x) @@ -331,7 +338,7 @@ def forward(self, x, seq_len): max_len = x.size(-1) mask = torch.arange(max_len).to(x.device) mask = mask.expand(x.size(0), max_len) >= seq_len.unsqueeze(1) - x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value,) + x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value) del mask pad_to = self.pad_to if not self.training: diff --git a/tests/collections/asr/test_asr_modules.py b/tests/collections/asr/test_asr_modules.py index 88b373116d8b..21f7b4c2c8d3 100644 --- a/tests/collections/asr/test_asr_modules.py +++ b/tests/collections/asr/test_asr_modules.py @@ 
-23,14 +23,20 @@ class TestASRModulesBasicTests: def test_AudioToMelSpectrogramPreprocessor(self): # Make sure constructor works instance1 = modules.AudioToMelSpectrogramPreprocessor(dither=0) - assert isinstance(instance1, modules.AudioToMelSpectrogramPreprocessor) + instance2 = modules.AudioToMelSpectrogramPreprocessor(dither=0, stft_conv=True) # Make sure forward doesn't throw with expected input input_signal = torch.randn(size=(4, 512)) - length = torch.randint(low=4, high=444, size=[4]) - res = instance1(input_signal=input_signal, length=length) - assert isinstance(res, tuple) - assert len(res) == 2 + length = torch.randint(low=161, high=500, size=[4]) + res1, length1 = instance1(input_signal=input_signal, length=length) + res2, length2 = instance2(input_signal=input_signal, length=length) + for len1, len2 in zip(length1, length2): + assert len1 == len2 + assert res1.shape == res2.shape + diff = torch.mean(torch.abs(res1 - res2)) + assert diff <= 3e-3 + diff = torch.max(torch.abs(res1 - res2)) + assert diff <= 1 @pytest.mark.unit def test_SpectrogramAugmentationr(self): @@ -41,7 +47,7 @@ def test_SpectrogramAugmentationr(self): # Make sure forward doesn't throw with expected input instance0 = modules.AudioToMelSpectrogramPreprocessor(dither=0) input_signal = torch.randn(size=(4, 512)) - length = torch.randint(low=4, high=444, size=[4]) + length = torch.randint(low=161, high=500, size=[4]) res0 = instance0(input_signal=input_signal, length=length) res = instance1(input_spec=res0[0]) @@ -57,7 +63,7 @@ def test_CropOrPadSpectrogramAugmentation(self): # Make sure forward doesn't throw with expected input instance0 = modules.AudioToMelSpectrogramPreprocessor(dither=0) input_signal = torch.randn(size=(4, 512)) - length = torch.randint(low=4, high=444, size=[4]) + length = torch.randint(low=161, high=500, size=[4]) res0 = instance0(input_signal=input_signal, length=length) res, new_length = instance1(input_signal=res0[0], length=length) From 2812fa6a1c7221b9f41d9ac3fc3bf2216263d65d Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Mon, 31 Aug 2020 13:14:56 -0700 Subject: [PATCH 10/12] Fixes: (#1084) * fix tests to pass when torch is built without cuda (e.g. 
macbook) * remove @experimental from several ASR modules and models * in tutorials switch branch candidate->main and clean outputs Signed-off-by: Oleksii Kuchaiev --- .../asr/models/classification_models.py | 3 - nemo/collections/asr/models/ctc_bpe_models.py | 1 - nemo/collections/asr/models/ctc_models.py | 3 - nemo/collections/asr/models/label_models.py | 1 - .../asr/modules/audio_preprocessing.py | 1 - nemo/collections/asr/modules/conv_asr.py | 3 - tests/collections/nlp/test_huggingface.py | 4 +- tutorials/asr/01_ASR_with_NeMo.ipynb | 2114 +++++----- tutorials/asr/02_Speech_Commands.ipynb | 3525 +++++++++-------- .../asr/05_Online_Noise_Augmentation.ipynb | 2726 ++++++------- 10 files changed, 4061 insertions(+), 4320 deletions(-) diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index f586d440bb16..e94e4febbea3 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -28,12 +28,10 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import * from nemo.utils import logging -from nemo.utils.decorators import experimental __all__ = ['EncDecClassificationModel', 'MatchboxNet'] -@experimental class EncDecClassificationModel(ASRModel): """Encoder decoder CTC-based models.""" @@ -305,6 +303,5 @@ def change_labels(self, new_labels: List[str]): logging.info(f"Changed decoder output to {self.decoder.num_classes} labels.") -@experimental class MatchboxNet(EncDecClassificationModel): pass diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index e687765d7612..92a4f5add7f8 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -31,7 +31,6 @@ __all__ = ['EncDecCTCModelBPE', 'JasperNetBPE', 'QuartzNetBPE'] -@experimental class EncDecCTCModelBPE(EncDecCTCModel): """Encoder decoder CTC-based models with Byte Pair Encoding.""" diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 9c7cfc67976b..8141e0b0c25e 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -35,7 +35,6 @@ __all__ = ['EncDecCTCModel', 'JasperNet', 'QuartzNet'] -@experimental class EncDecCTCModel(ASRModel): """Base class for encoder decoder CTC-based models.""" @@ -363,11 +362,9 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo return temporary_datalayer -@experimental class JasperNet(EncDecCTCModel): pass -@experimental class QuartzNet(EncDecCTCModel): pass diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index 0a0e3ebdf69f..e6eb1961a0c5 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -35,7 +35,6 @@ __all__ = ['EncDecSpeakerLabelModel', 'ExtractSpeakerEmbeddingsModel'] -@experimental class EncDecSpeakerLabelModel(ModelPT): """Encoder decoder class for speaker label models. 
Model class creates training, validation methods for setting up data diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index ec2ee52a8eff..9bfb126dcb54 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -247,7 +247,6 @@ def filter_banks(self): return self.featurizer.filter_banks -@experimental class AudioToMFCCPreprocessor(AudioPreprocessor): """Preprocessor that converts wavs to MFCCs. Uses torchaudio.transforms.MFCC. diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index 242b6c506392..28b3431d5ce6 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -41,7 +41,6 @@ __all__ = ['ConvASRDecoder', 'ConvASREncoder', 'ConvASRDecoderClassification'] -@experimental class ConvASREncoder(NeuralModule, Exportable): """ Convolutional encoder for ASR models. With this class you can implement JasperNet and QuartzNet models. @@ -187,7 +186,6 @@ def forward(self, audio_signal, length=None): return s_input[-1], length -@experimental class ConvASRDecoder(NeuralModule, Exportable): """Simple ASR Decoder for use with CTC-based models such as JasperNet and QuartzNet @@ -264,7 +262,6 @@ def num_classes_with_blank(self): return self._num_classes -@experimental class ConvASRDecoderClassification(NeuralModule): """Simple ASR Decoder for use with classification models such as JasperNet and QuartzNet diff --git a/tests/collections/nlp/test_huggingface.py b/tests/collections/nlp/test_huggingface.py index 0c356eb33c84..d17906563085 100644 --- a/tests/collections/nlp/test_huggingface.py +++ b/tests/collections/nlp/test_huggingface.py @@ -17,6 +17,7 @@ from unittest import TestCase import pytest +import torch import nemo.collections.nlp as nemo_nlp @@ -26,7 +27,8 @@ def do_export(model, name: str): # Generate filename in the temporary directory. tmp_file_name = os.path.join(tmpdir, name + '.onnx') # Test export. - model = model.cuda() + if torch.cuda.is_available(): + model = model.cuda() model.export(tmp_file_name) diff --git a/tutorials/asr/01_ASR_with_NeMo.ipynb b/tutorials/asr/01_ASR_with_NeMo.ipynb index 1b5dd35baff4..c71c92490471 100644 --- a/tutorials/asr/01_ASR_with_NeMo.ipynb +++ b/tutorials/asr/01_ASR_with_NeMo.ipynb @@ -1,1061 +1,1059 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - }, - "colab": { - "name": "ASR_with_NeMo.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lJz6FDU1lRzc" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. 
Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "## Install dependencies\n", + "!pip install wget\n", + "!apt-get install sox libsndfile1 ffmpeg\n", + "!pip install unidecode\n", + "\n", + "## Install NeMo\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]\n", + "\n", + "## Grab the config we'll use in this example\n", + "!mkdir configs\n", + "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/config.yaml" + ] }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "lJz6FDU1lRzc", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell.\n", - "\n", - "## Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg\n", - "!pip install unidecode\n", - "\n", - "## Install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@01d4a6eb5f4178bd466c0646c7a086e76c4fb11e#egg=nemo_toolkit[all]\n", - "\n", - "## Grab the config we'll use in this example\n", - "!mkdir configs\n", - "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/asr/conf/config.yaml" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v1Jk9etFlRzf", - "colab_type": "text" - }, - "source": [ - "# Introduction to End-To-End Automatic Speech Recognition\n", - "\n", - "This notebook contains a basic tutorial of Automatic Speech Recognition (ASR) concepts, introduced with code snippets using the [NeMo framework](https://github.com/NVIDIA/NeMo).\n", - "We will first introduce the basics of the main concepts behind speech recognition, then explore concrete examples of what the data looks like and walk through putting together a simple end-to-end ASR pipeline.\n", - "\n", - "We assume that you are familiar with general machine learning concepts and can follow Python code, and we'll be using the [AN4 dataset from CMU](http://www.speech.cs.cmu.edu/databases/an4/) (with processing using `sox`)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YLln3U-IlRzg", - "colab_type": "text" - }, - "source": [ - "## Conceptual Overview: What is ASR?\n", - "\n", - "ASR, or **Automatic Speech Recognition**, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text). Our goal is usually to have a model that minimizes the **Word Error Rate (WER)** metric when transcribing speech input. In other words, given some audio file (e.g. 
a WAV file) containing speech, how do we transform this into the corresponding text with as few errors as possible?\n", - "\n", - "Traditional speech recognition takes a generative approach, modeling the full pipeline of how speech sounds are produced: from a **language model** that encapsulates likely orderings of words (e.g. an n-gram model), to a **pronunciation model** for each word in the vocabulary (e.g. a pronunciation table), to an **acoustic model** that translates the pronunciations to audio waveforms (e.g. a Gaussian Mixture Model), and so on.\n", - "\n", - "Then, if we receive some spoken input, our goal would be to find the most likely sequence of text that would result in the given audio according to our pipeline of models. Overall, with traditional speech recognition, we try to model `Pr(audio|transcript)*Pr(transcript)`, and take the argmax of this over possible transcripts.\n", - "\n", - "Over time, neural nets advanced to the point where each component of the traditional speech recognition model could be replaced by a neural model that had better performance and that had a greater potential for generalization. For example, we could replace an n-gram model with a neural language model, and replace a pronunciation table with a neural pronunciation model, and so on. However, each of these neural models need to be trained individually on different tasks, and errors in any model in the pipeline could throw off the whole prediction.\n", - "\n", - "Thus, we can see the appeal of **end-to-end ASR architectures**--discriminative models that simply take an audio input and give a textual output, and in which all components are trained together towards the same goal. A much easier pipeline to handle!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0S5iZPMSlRzg", - "colab_type": "text" - }, - "source": [ - "### End-To-End ASR\n", - "\n", - "With an end-to-end model, we want to directly learn `Pr(transcript|audio)` in order to predict the transcripts from the original audio. Since we are dealing with sequential information--audio data over time that corresponds to a sequence of letters--RNNs are the obvious choice. But now we have a pressing problem to deal with: since our input sequence (number of audio timesteps) is not the same length as our desired output (transcript length), how do we match each time step from the audio data to the correct output characters?\n", - "\n", - "Earlier speech recognition approaches relied on **temporally-aligned data**, in which each segment of time in an audio file was matched up to a corresponding speech sound such as a phoneme or word. However, if we would like to have the flexibility to predict letter-by-letter to prevent OOV (out of vocabulary) issues, then each time step in the data would have to be labeled with the letter sound that the speaker is making at that point in the audio file. With that information, it seems like we should simply be able to try to predict the correct letter for each time step and then collapse the repeated letters (e.g. the prediction output `LLLAAAAPPTOOOPPPP` would become `LAPTOP`). It turns out that this idea has some problems: not only does alignment make the dataset incredibly labor-intensive to label, but also, what do we do with words like \"book\" that contain consecutive repeated letters? 
Simply squashing repeated letters together would not work in that case!\n", - "\n", - "![Alignment example](https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/asr/notebooks/images/alignment_example.png)\n", - "\n", - "Modern end-to-end approaches get around this using methods that don't require manual alignment at all, so that the input-output pairs are really just the raw audio and the transcript--no extra data or labeling required. Let's briefly go over two popular approaches that allow us to do this, Connectionist Temporal Classification (CTC) and sequence-to-sequence models with attention.\n", - "\n", - "#### Connectionist Temporal Classification (CTC)\n", - "\n", - "In normal speech recognition prediction output, we would expect to have characters such as the letters from A through Z, numbers 0 through 9, spaces (\"\\_\"), and so on. CTC introduces a new intermediate output token called the **blank token** (\"-\") that is useful for getting around the alignment issue.\n", - "\n", - "With CTC, we still predict one token per time segment of speech, but we use the blank token to figure out where we can and can't collapse the predictions. The appearance of a blank token helps separate repeating letters that should not be collapsed. For instance, with an audio snippet segmented into `T=11` time steps, we could get predictions that look like `BOO-OOO--KK`, which would then collapse to `\"BO-O-K\"`, and then we would remove the blank tokens to get our final output, `BOOK`.\n", - "\n", - "Now, we can predict one output token per time step, then collapse and clean to get sensible output without any fear of ambiguity from repeating letters! A simple way of getting predictions like this would be to apply a bidirectional RNN to the audio input, apply softmax over each time step's output, and then take the token with the highest probability. The method of always taking the best token at each time step is called **greedy decoding, or max decoding**.\n", - "\n", - "To calculate our loss for backprop, we would like to know the log probability of the model producing the correct transcript, `log(Pr(transcript|audio))`. We can get the log probability of a single intermediate output sequence (e.g. `BOO-OOO--KK`) by summing over the log probabilities we get from each token's softmax value, but note that the resulting sum is different from the log probability of the transcript itself (`BOOK`). This is because there are multiple possible output sequences of the same length that can be collapsed to get the same transcript (e.g. `BBO--OO-KKK` also results in `BOOK`), and so we need to **marginalize over every valid sequence of length `T` that collapses to the transcript**.\n", - "\n", - "Therefore, to get our transcript's log probability given our audio input, we must sum the log probabilities of every sequence of length `T` that collapses to the transcript (e.g. `log(Pr(output: \"BOOK\"|audio)) = log(Pr(BOO-OOO--KK|audio)) + log(Pr(BBO--OO-KKK|audio)) + ...`). 
In practice, we can use a dynamic programming approach to calculate this, accumulating our log probabilities over different \"paths\" through the softmax outputs at each time step.\n", - "\n", - "If you would like a more in-depth explanation of how CTC works, or how we can improve our results by using a modified beam search algorithm, feel free to check out the Further Reading section at the end of this notebook for more resources.\n", - "\n", - "#### Sequence-to-Sequence with Attention\n", - "\n", - "One problem with CTC is that predictions at different time steps are conditionally independent, which is an issue because the words in a continuous utterance tend to be related to each other in some sensible way. With this conditional independence assumption, we can't learn a language model that can represent such dependencies, though we can add a language model on top of the CTC output to mitigate this to some degree.\n", - "\n", - "A popular alternative is to use a sequence-to-sequence model with attention. A typical seq2seq model for ASR consists of some sort of **bidirectional RNN encoder** that consumes the audio sequence timestep-by-timestep, and where the outputs are then passed to an **attention-based decoder**. Each prediction from the decoder is based on attending to some parts of the entire encoded input, as well as the previously outputted tokens.\n", - "\n", - "The outputs of the decoder can be anything from word pieces to phonemes to letters, and since predictions are not directly tied to time steps of the input, we can just continue producing tokens one-by-one until an end token is given (or we reach a specified max output length). This way, we do not need to deal with audio alignment, and our predicted transcript is just the sequence of outputs given by our decoder.\n", - "\n", - "Now that we have an idea of what some popular end-to-end ASR models look like, let's take a look at the audio data we'll be working with for our example." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "38aYTCTIlRzh", - "colab_type": "text" - }, - "source": [ - "## Taking a Look at Our Data (AN4)\n", - "\n", - "The AN4 dataset, also known as the Alphanumeric dataset, was collected and published by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. We choose to use AN4 for this tutorial because it is relatively small, with 948 training and 130 test utterances, and so it trains quickly.\n", - "\n", - "Before we get started, let's download and prepare the dataset. The utterances are available as `.sph` files, so we will need to convert them to `.wav` for later processing. Please make sure you have [Sox](http://sox.sourceforge.net/) installed for this step (see the \"Downloads\" section of the main page)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gAhsmi6HlRzh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This is where the an4/ directory will be placed.\n", - "# Change this if you don't want the data to be extracted in the current directory.\n", - "data_dir = '.'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "scrolled": true, - "id": "Yb4fuUvWlRzk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import glob\n", - "import os\n", - "import subprocess\n", - "import tarfile\n", - "import wget\n", - "\n", - "# Download the dataset. 
This will take a few moments...\n", - "print(\"******\")\n", - "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", - " an4_path = wget.download(an4_url, data_dir)\n", - " print(f\"Dataset downloaded at: {an4_path}\")\n", - "else:\n", - " print(\"Tarfile already exists.\")\n", - " an4_path = data_dir + '/an4_sphere.tar.gz'\n", - "\n", - "if not os.path.exists(data_dir + '/an4/'):\n", - " # Untar and convert .sph to .wav (using sox)\n", - " tar = tarfile.open(an4_path)\n", - " tar.extractall(path=data_dir)\n", - "\n", - " print(\"Converting .sph to .wav...\")\n", - " sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", - " for sph_path in sph_list:\n", - " wav_path = sph_path[:-4] + '.wav'\n", - " cmd = [\"sox\", sph_path, wav_path]\n", - " subprocess.run(cmd)\n", - "print(\"Finished conversion.\\n******\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m_LFeM0elRzm", - "colab_type": "text" - }, - "source": [ - "You should now have a folder called `an4` that contains `etc/an4_train.transcription`, `etc/an4_test.transcription`, audio files in `wav/an4_clstk` and `wav/an4test_clstk`, along with some other files we will not need.\n", - "\n", - "Now we can load and take a look at the data. As an example, file `cen2-mgah-b.wav` is a 2.6 second-long audio recording of a man saying the letters \"G L E N N\" one-by-one (feel free to check this out by listening to `./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav`). In an ASR task, the WAV file would be our input, and \"G L E N N\" would be our desired output.\n", - "\n", - "Let's plot the waveform, which is simply a line plot of the sequence of values that we read from the file. This is a format of viewing audio that you are likely to be familiar with seeing in many audio editors and visualizers:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "MqIAKkqelRzm", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%matplotlib inline\n", - "import librosa\n", - "import librosa.display\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "\n", - "# Plot our example audio file's waveform\n", - "example_file = data_dir + '/an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'\n", - "audio, sample_rate = librosa.load(example_file)\n", - "\n", - "plt.rcParams['figure.figsize'] = (15,7)\n", - "plt.title('Waveform of Audio Example')\n", - "plt.ylabel('Amplitude')\n", - "\n", - "_ = librosa.display.waveplot(audio)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gg6RR_yolRzo", - "colab_type": "text" - }, - "source": [ - "We can see the activity in the waveform that corresponds to each letter in the audio, as our speaker here enunciates quite clearly!\n", - "You can kind of tell that each spoken letter has a different \"shape,\" and it's interesting to note that last two blobs look relatively similar, which is expected because they are both the letter \"N.\"\n", - "\n", - "### Spectrograms and Mel Spectrograms\n", - "\n", - "However, since audio information is more useful in the context of frequencies of sound over time, we can get a better representation than this raw sequence of 57,330 values.\n", - "We can apply a [Fourier Transform](https://en.wikipedia.org/wiki/Fourier_transform) on our audio signal to get something more useful: a **spectrogram**, which is a representation of the energy levels (i.e. 
amplitude, or \"loudness\") of each frequency (i.e. pitch) of the signal over the duration of the file.\n", - "A spectrogram (which can be viewed as a heat map) is a good way of seeing how the *strengths of various frequencies in the audio vary over time*, and is obtained by breaking up the signal into smaller, usually overlapping chunks and performing a Short-Time Fourier Transform (STFT) on each.\n", - "\n", - "Let's examine what the spectrogram of our sample looks like." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oCFneEs1lRzp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)\n", - "spec = np.abs(librosa.stft(audio))\n", - "spec_db = librosa.amplitude_to_db(spec, ref=np.max) # Decibels\n", - "\n", - "# Use log scale to view frequencies\n", - "librosa.display.specshow(spec_db, y_axis='log', x_axis='time')\n", - "plt.colorbar()\n", - "plt.title('Audio Spectrogram');" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9OPc4tcalRzs", - "colab_type": "text" - }, - "source": [ - "Again, we are able to see each letter being pronounced, and that the last two blobs that correspond to the \"N\"s are pretty similar-looking. But how do we interpret these shapes and colors? Just as in the waveform plot before, we see time passing on the x-axis (all 2.6s of audio). But now, the y-axis represents different frequencies (on a log scale), and *the color on the plot shows the strength of a frequency at a particular point in time*.\n", - "\n", - "We're still not done yet, as we can make one more potentially useful tweak: using the **Mel Spectrogram** instead of the normal spectrogram. This is simply a change in the frequency scale that we use from linear (or logarithmic) to the mel scale, which is \"a perceptual scale of pitches judged by listeners to be equal in distance from one another\" (from [Wikipedia](https://en.wikipedia.org/wiki/Mel_scale)).\n", - "\n", - "In other words, it's a transformation of the frequencies to be more aligned to what humans perceive; a change of +1000Hz from 2000Hz->3000Hz sounds like a larger difference to us than 9000Hz->10000Hz does, so the mel scale normalizes this such that equal distances sound like equal differences to the human ear. Intuitively, we use the mel spectrogram because in this case we are processing and transcribing human speech, such that transforming the scale to better match what we hear is a useful procedure." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7yQXVn-TlRzt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Plot the mel spectrogram of our sample\n", - "mel_spec = librosa.feature.melspectrogram(audio, sr=sample_rate)\n", - "mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)\n", - "\n", - "librosa.display.specshow(\n", - " mel_spec_db, x_axis='time', y_axis='mel')\n", - "plt.colorbar()\n", - "plt.title('Mel Spectrogram');" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RSCyVizDlRz1", - "colab_type": "text" - }, - "source": [ - "## Convolutional ASR Models\n", - "\n", - "Let's take a look at the model that we will be building, and how we specify its parameters.\n", - "\n", - "### The Jasper Model\n", - "\n", - "We will be training a small [Jasper (Just Another SPeech Recognizer) model](https://arxiv.org/abs/1904.03288) from scratch (e.g. initialized randomly). 
\n", - "In brief, Jasper architectures consist of a repeated block structure that utilizes 1D convolutions.\n", - "In a Jasper_KxR model, `R` sub-blocks (consisting of a 1D convolution, batch norm, ReLU, and dropout) are grouped into a single block, which is then repeated `K` times.\n", - "We also have a one extra block at the beginning and a few more at the end that are invariant of `K` and `R`, and we use CTC loss.\n", - "\n", - "### The QuartzNet Model\n", - "\n", - "The QuartzNet is better variant of Jasper with a key difference that it uses time-channel separable 1D convolutions. This allows it to dramatically reduce number of weights while keeping similar accuracy.\n", - "\n", - "A Jasper/QuartzNet models look like this (QuartzNet model is pictured):\n", - "\n", - "![QuartzNet with CTC](https://developer.nvidia.com/blog/wp-content/uploads/2020/05/quartznet-model-architecture-1-625x742.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEpNci7slRzw", - "colab_type": "text" - }, - "source": [ - "# Using NeMo for Automatic Speech Recognition\n", - "\n", - "Now that we have an idea of what ASR is and how the audio data looks like, we can start using NeMo to do some ASR!\n", - "\n", - "We'll be using the **Neural Modules (NeMo) toolkit** for this part, so if you haven't already, you should download and install NeMo and its dependencies. To do so, just follow the directions on the [GitHub page](https://github.com/NVIDIA/NeMo), or in the [documentation](https://docs.nvidia.com/deeplearning/nemo/developer_guide/en/candidate/).\n", - "\n", - "NeMo lets us easily hook together the components (modules) of our model, such as the data layer, intermediate layers, and various losses, without worrying too much about implementation details of individual parts or connections between modules. NeMo also comes with complete models which only require your data and hyperparameters for training." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4_W0lhaQlRzx", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# NeMo's \"core\" package\n", - "import nemo\n", - "# NeMo's ASR collection - this collections contains complete ASR models and\n", - "# building blocks (modules) for ASR\n", - "import nemo.collections.asr as nemo_asr" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v_W8EbYktZE3", - "colab_type": "text" - }, - "source": [ - "## Using an Out-of-the-Box Model\n", - "\n", - "NeMo's ASR collection comes with many building blocks and even complete models that we can use for training and evaluation. Moreover, several models come with pre-trained weights. Let's instantiate a complete QuartzNet15x5 model." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "KFZZpYult96G", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This line will download pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", - "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"QuartzNet15x5Base-En\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KucxoFJhum0i", - "colab_type": "text" - }, - "source": [ - "Next, we'll simply add paths to files we want to transcribe into the list and pass it to our model. Note that it will work for relatively short (<25 seconds) files. 
" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3QCpR_93u1hp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "files = ['./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav']\n", - "for fname, transcription in zip(files, quartznet.transcribe(paths2audio_files=files)):\n", - " print(f\"Audio in {fname} was recognized as: {transcription}\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ppUm_kuavm_f", - "colab_type": "text" - }, - "source": [ - "That was easy! But there are plenty of scenarios where you would want to fine-tune the model on your own data or even train from scratch. For example, this out-of-the box model will obviously not work for Spanish and would likely perform poorly for telephone audio. So if you have collected your own data, you certainly should attempt to fine-tune or train on it!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ABUDaC5Js7AW", - "colab_type": "text" - }, - "source": [ - "## Training from Scratch\n", - "\n", - "To train from scratch, you need to prepare your training data in the right format and specify your models architecture." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RdNyw1b_zgtm", - "colab_type": "text" - }, - "source": [ - "### Creating Data Manifests\n", - "\n", - "The first thing we need to do now is to create manifests for our training and evaluation data, which will contain the metadata of our audio files. NeMo data sets take in a standardized manifest format where each line corresponds to one sample of audio, such that the number of lines in a manifest is equal to the number of samples that are represented by that manifest. A line must contain the path to an audio file, the corresponding transcript (or path to a transcript file), and the duration of the audio sample.\n", - "\n", - "Here's an example of what one line in a NeMo-compatible manifest might look like:\n", - "```\n", - "{\"audio_filepath\": \"path/to/audio.wav\", \"duration\": 3.45, \"text\": \"this is a nemo tutorial\"}\n", - "```\n", - "\n", - "We can build our training and evaluation manifests using `an4/etc/an4_train.transcription` and `an4/etc/an4_test.transcription`, which have lines containing transcripts and their corresponding audio file IDs:\n", - "```\n", - "...\n", - " P I T T S B U R G H (cen5-fash-b)\n", - " TWO SIX EIGHT FOUR FOUR ONE EIGHT (cen7-fash-b)\n", - "...\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lVB1sG1GlRzz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# --- Building Manifest Files --- #\n", - "import json\n", - "\n", - "# Function to build a manifest\n", - "def build_manifest(transcripts_path, manifest_path, wav_path):\n", - " with open(transcripts_path, 'r') as fin:\n", - " with open(manifest_path, 'w') as fout:\n", - " for line in fin:\n", - " # Lines look like this:\n", - " # transcript (fileID)\n", - " transcript = line[: line.find('(')-1].lower()\n", - " transcript = transcript.replace('', '').replace('', '')\n", - " transcript = transcript.strip()\n", - "\n", - " file_id = line[line.find('(')+1 : -2] # e.g. 
\"cen4-fash-b\"\n", - " audio_path = os.path.join(\n", - " data_dir, wav_path,\n", - " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", - " file_id + '.wav')\n", - "\n", - " duration = librosa.core.get_duration(filename=audio_path)\n", - "\n", - " # Write the metadata to the manifest\n", - " metadata = {\n", - " \"audio_filepath\": audio_path,\n", - " \"duration\": duration,\n", - " \"text\": transcript\n", - " }\n", - " json.dump(metadata, fout)\n", - " fout.write('\\n')\n", - " \n", - "# Building Manifests\n", - "print(\"******\")\n", - "train_transcripts = data_dir + '/an4/etc/an4_train.transcription'\n", - "train_manifest = data_dir + '/an4/train_manifest.json'\n", - "if not os.path.isfile(train_manifest):\n", - " build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n", - " print(\"Training manifest created.\")\n", - "\n", - "test_transcripts = data_dir + '/an4/etc/an4_test.transcription'\n", - "test_manifest = data_dir + '/an4/test_manifest.json'\n", - "if not os.path.isfile(test_manifest):\n", - " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", - " print(\"Test manifest created.\")\n", - "print(\"***Done***\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W2fShQzRzo-M", - "colab_type": "text" - }, - "source": [ - "### Specifying Our Model with a YAML Config File\n", - "\n", - "For this tutorial, we'll build a *Jasper_4x1 model*, with `K=4` blocks of single (`R=1`) sub-blocks and a *greedy CTC decoder*, using the configuration found in `./configs/config.yaml`.\n", - "\n", - "If we open up this config file, we find model section which describes architecture of our model. A model contains an entry labeled `encoder`, with a field called `jasper` that contains a list with multiple entries. Each of the members in this list specifies one block in our model, and looks something like this:\n", - "```\n", - "- filters: 128\n", - " repeat: 1\n", - " kernel: [11]\n", - " stride: [2]\n", - " dilation: [1]\n", - " dropout: 0.2\n", - " residual: false\n", - " separable: true\n", - " se: true\n", - " se_context_size: -1\n", - "```\n", - "The first member of the list corresponds to the first block in the Jasper architecture diagram, which appears regardless of `K` and `R`.\n", - "Next, we have four entries that correspond to the `K=4` blocks, and each has `repeat: 1` since we are using `R=1`.\n", - "These are followed by two more entries for the blocks that appear at the end of our Jasper model before the CTC loss.\n", - "\n", - "There are also some entries at the top of the file that specify how we will handle training (`train_ds`) and validation (`validation_ds`) data.\n", - "\n", - "Using a YAML config such as this is helpful for getting a quick and human-readable overview of what your architecture looks like, and allows you to swap out model and run configurations easily without needing to change your code." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PXVKBniMlRz5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# --- Config Information ---#\n", - "from ruamel.yaml import YAML\n", - "config_path = './configs/config.yaml'\n", - "\n", - "yaml = YAML(typ='safe')\n", - "with open(config_path) as f:\n", - " params = yaml.load(f)\n", - "print(params)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wUmq3p2Aw_5N", - "colab_type": "text" - }, - "source": [ - "### Training with PyTorch Lightning\n", - "\n", - "NeMo models and modules can be used in any PyTorch code where torch.nn.Module is expected.\n", - "\n", - "However, NeMo's models are based on [PytorchLightning's](https://github.com/PyTorchLightning/pytorch-lightning) LightningModule and we recommend you use PytorchLightning for training and fine-tuning as it makes using mixed precision and distributed training very easy. So to start, let's create Trainer instance for training on GPU for 50 epochs" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GUfR6tAK0k2u", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pytorch_lightning as pl\n", - "trainer = pl.Trainer(gpus=1, max_epochs=50)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IEn2RyvgxxvO", - "colab_type": "text" - }, - "source": [ - "Next, we instantiate and ASR model based on our ``config.yaml`` file from the previous section.\n", - "Note that this is a stage during which we also tell the model where our training and validation manifests are." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Cbf0fsMK09lk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from omegaconf import DictConfig\n", - "params['model']['train_ds']['manifest_filepath'] = train_manifest\n", - "params['model']['validation_ds']['manifest_filepath'] = test_manifest\n", - "first_asr_model = nemo_asr.models.EncDecCTCModel(cfg=DictConfig(params['model']), trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hWtzwL5qXTYq", - "colab_type": "text" - }, - "source": [ - "With that, we can start training with just one line!" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "inRJsnrz1psq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Start training!!!\n", - "trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jpYXX-GslR0E", - "colab_type": "text" - }, - "source": [ - "There we go! We've put together a full training pipeline for the model and trained it for 50 epochs.\n", - "\n", - "### After Training: Monitoring Progress and Changing Hyperparameters\n", - "We can now start Tensorboard to see how training went. Recall that WER stands for Word Error Rate and so the lower it is, the better." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "n_0y3stSXDX_", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z0h-BME7U8yb", - "colab_type": "text" - }, - "source": [ - "We could improve this model by playing with hyperparameters. 
We can look at the current hyperparameters with the following:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7kdQbpohXnEd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(params['model']['optim'])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sGZzRCvIW8kE", - "colab_type": "text" - }, - "source": [ - "Let's say we wanted to change the learning rate. To do so, we can create a `new_opt` dict and set our desired learning rate, then call `.setup_optimization()` with the new optimization parameters." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AbigFKUtYgvn", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import copy\n", - "new_opt = copy.deepcopy(params['model']['optim'])\n", - "new_opt['lr'] = 0.001\n", - "first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))\n", - "# And then you can invoke trainer.fit(first_asr_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5Kwg8Cz-aaO", - "colab_type": "text" - }, - "source": [ - "## Inference\n", - "\n", - "Let's have a quick look at how one could run inference with NeMo's ASR model.\n", - "\n", - "First, ``EncDecCTCModel`` and its subclasses contain a handy ``transcribe`` method which can be used to simply obtain audio files' transcriptions. It also has batch_size argument to improve performance." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3FT0klSV268p", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(first_asr_model.transcribe(paths2audio_files=['./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav',\n", - " './an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav',\n", - " './an4/wav/an4_clstk/fmjd/cen8-fmjd-b.wav',\n", - " './an4/wav/an4_clstk/fkai/cen8-fkai-b.wav'],\n", - " batch_size=4))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6FiCfLX0D7py", - "colab_type": "text" - }, - "source": [ - "Below is an example of a simple inference loop in pure PyTorch. It also shows how one can compute Word Error Rate (WER) metric between predictions and references." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7mP4r1Gx_Ilt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Bigger batch-size = bigger throughput\n", - "params['model']['validation_ds']['batch_size'] = 16\n", - "\n", - "# Setup the test data loader and make sure the model is on GPU\n", - "first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])\n", - "first_asr_model.cuda()\n", - "\n", - "# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.\n", - "# WER is computed as numerator/denominator.\n", - "# We'll gather all the test batches' numerators and denominators.\n", - "wer_nums = []\n", - "wer_denoms = []\n", - "\n", - "# Loop over all test batches.\n", - "# Iterating over the model's `test_dataloader` will give us:\n", - "# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)\n", - "# See the AudioToCharDataset for more details.\n", - "for test_batch in first_asr_model.test_dataloader():\n", - " test_batch = [x.cuda() for x in test_batch]\n", - " targets = test_batch[2]\n", - " targets_lengths = test_batch[3] \n", - " log_probs, encoded_len, greedy_predictions = first_asr_model(\n", - " input_signal=test_batch[0], input_signal_length=test_batch[1]\n", - " )\n", - " # Notice the model has a helper object to compute WER\n", - " wer_num, wer_denom = first_asr_model._wer(greedy_predictions, targets, targets_lengths)\n", - " wer_nums.append(wer_num.detach().cpu().numpy())\n", - " wer_denoms.append(wer_denom.detach().cpu().numpy())\n", - "\n", - "# We need to sum all numerators and denominators first. Then divide.\n", - "print(f\"WER = {sum(wer_nums)/sum(wer_denoms)}\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0kM9kBNOCptf", - "colab_type": "text" - }, - "source": [ - "This WER is not particularly impressive and could be significantly improved. You could train longer (try 100 epochs) to get a better number. Check out the next section on how to improve it further." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RBcJtg5ulR0H", - "colab_type": "text" - }, - "source": [ - "## Model Improvements\n", - "\n", - "You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.\n", - "\n", - "### Data Augmentation\n", - "\n", - "There exist several ASR data augmentation methods that can increase the size of our training set.\n", - "\n", - "For example, we can perform augmentation on the spectrograms by zeroing out specific frequency segments (\"frequency masking\") or time segments (\"time masking\") as described by [SpecAugment](https://arxiv.org/abs/1904.08779), or zero out rectangles on the spectrogram as in [Cutout](https://arxiv.org/pdf/1708.04552.pdf). In NeMo, we can do all three of these by simply adding in a `SpectrogramAugmentation` neural module. (As of now, it does not perform the time warping from the SpecAugment paper.)\n", - "\n", - "Our toy model does not do spectrogram augmentation. 
But the real one we got from cloud does:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9glGogaPlR0H", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(quartznet._cfg['spec_augment'])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LdwdcA_a640R", - "colab_type": "text" - }, - "source": [ - "If you want to enable SpecAugment in your model, make sure your .yaml config file contains 'model/spec_augment' section which looks like the one above." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2f142kIQc1Z2", - "colab_type": "text" - }, - "source": [ - "### Transfer learning\n", - "\n", - "Transfer learning is an important machine learning technique that uses a model’s knowledge of one task to make it perform better on another. Fine-tuning is one of the techniques to perform transfer learning. It is an essential part of the recipe for many state-of-the-art results where a base model is first pretrained on a task with abundant training data and then fine-tuned on different tasks of interest where the training data is less abundant or even scarce.\n", - "\n", - "In ASR you might want to do fine-tuning in multiple scenarios, for example, when you want to improve your model's performance on a particular domain (medical, financial, etc.) or on accented speech. You can even transfer learn from one language to another! Check out [this paper](https://arxiv.org/abs/2005.04290) for examples.\n", - "\n", - "Transfer learning with NeMo is simple. Let's demonstrate how the model we got from the cloud could be fine-tuned on AN4 data. (NOTE: this is a toy example). And, while we are at it, we will change model's vocabulary, just to demonstrate how it's done." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hl320dsydWX0", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Check what kind of vocabulary/alphabet the model has right now\n", - "print(quartznet.decoder.vocabulary)\n", - "\n", - "# Let's add \"!\" symbol there. Note that you can (and should!) change the vocabulary\n", - "# entirely when fine-tuning using a different language.\n", - "quartznet.change_vocabulary(\n", - " new_vocabulary=[\n", - " ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',\n", - " 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', \"'\", \"!\"\n", - " ]\n", - ")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M7lvmiMSd3Aw", - "colab_type": "text" - }, - "source": [ - "After this, our decoder has completely changed, but our encoder (which is where most of the weights are) remained intact. Let's fine tune-this model for 2 epochs on AN4 dataset. We will also use the smaller learning rate from ``new_opt` (see the \"After Training\" section)`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_PZJIso-eDl-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Use the smaller learning rate we set before\n", - "quartznet.setup_optimization(optim_config=DictConfig(new_opt))\n", - "\n", - "# Point to the data we'll use for fine-tuning as the training set\n", - "quartznet.setup_training_data(train_data_config=params['model']['train_ds'])\n", - "\n", - "# Point to the new validation data for fine-tuning\n", - "quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])\n", - "\n", - "# And now we can create a PyTorch Lightning trainer and call `fit` again.\n", - "trainer = pl.Trainer(gpus=[1], max_epochs=2)\n", - "trainer.fit(quartznet)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VURa1NavlR0U", - "colab_type": "text" - }, - "source": [ - "### Fast Training\n", - "\n", - "Last but not least, we could simply speed up training our model! If you have the resources, you can speed up training by splitting the workload across multiple GPUs. Otherwise (or in addition), there's always mixed precision training, which allows you to increase your batch size.\n", - "\n", - "You can use [PyTorch Lightning's Trainer object](https://pytorch-lightning.readthedocs.io/en/latest/trainer.html) to handle mixed-precision and distributed training for you. Below are some examples of flags you would pass to the `Trainer` to use these features:\n", - "\n", - "```python\n", - "# Mixed precision:\n", - "trainer = pl.Trainer(amp_level='O1', precision=16)\n", - "\n", - "# Trainer with a distributed backend:\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", - "\n", - "# Of course, you can combine these flags as well.\n", - "```\n", - "\n", - "Finally, have a look at [example scripts in NeMo repository](https://github.com/NVIDIA/NeMo/blob/candidate/examples/asr/speech_to_text.py) which can handle mixed precision and distibuted training using command-line arguments." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wteGqroafWg1", - "colab_type": "text" - }, - "source": [ - "## Under the Hood\n", - "\n", - "NeMo is open-source and we do all our model development in the open, so you can inspect our code if you wish.\n", - "\n", - "In particular, ``nemo_asr.model.EncDecCTCModel`` is an encoder-decoder model which is constructed using several ``Neural Modules`` taken from ``nemo_asr.modules.`` Here is what its forward pass looks like:\n", - "```python\n", - "def forward(self, input_signal, input_signal_length):\n", - " processed_signal, processed_signal_len = self.preprocessor(\n", - " input_signal=input_signal, length=input_signal_length,\n", - " )\n", - " # Spec augment is not applied during evaluation/testing\n", - " if self.spec_augmentation is not None and self.training:\n", - " processed_signal = self.spec_augmentation(input_spec=processed_signal)\n", - " encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_len)\n", - " log_probs = self.decoder(encoder_output=encoded)\n", - " greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)\n", - " return log_probs, encoded_len, greedy_predictions\n", - "```\n", - "Here:\n", - "\n", - "* ``self.preprocessor`` is an instance of ``nemo_asr.modules.AudioToMelSpectrogramPreprocessor``, which is a neural module that takes audio signal and converts it into a Mel-Spectrogram\n", - "* ``self.spec_augmentation`` - is a neural module of type ```nemo_asr.modules.SpectrogramAugmentation``, which implements data augmentation. \n", - "* ``self.encoder`` - is a convolutional Jasper/QuartzNet-like encoder of type ``nemo_asr.modules.ConvASREncoder``\n", - "* ``self.decoder`` - is a ``nemo_asr.modules.ConvASRDecoder`` which simply projects into the target alphabet (vocabulary).\n", - "\n", - "Also, ``EncDecCTCModel`` uses the audio dataset class ``nemo_asr.data.AudioToCharDataset`` and CTC loss implemented in ``nemo_asr.losses.CTCLoss``.\n", - "\n", - "You can use these and other neural modules (or create new ones yourself!) to construct new ASR models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smzlvbhelR0U", - "colab_type": "text" - }, - "source": [ - "# Further Reading/Watching:\n", - "\n", - "That's all for now! 
If you'd like to learn more about the topics covered in this tutorial, here are some resources that may interest you:\n", - "- [Stanford Lecture on ASR](https://www.youtube.com/watch?v=3MjIkWxXigM)\n", - "- [\"An Intuitive Explanation of Connectionist Temporal Classification\"](https://towardsdatascience.com/intuitively-understanding-connectionist-temporal-classification-3797e43a86c)\n", - "- [Explanation of CTC with Prefix Beam Search](https://medium.com/corti-ai/ctc-networks-and-language-models-prefix-beam-search-explained-c11d1ee23306)\n", - "- [Listen Attend and Spell Paper (seq2seq ASR model)](https://arxiv.org/abs/1508.01211)\n", - "- [Explanation of the mel spectrogram in more depth](https://towardsdatascience.com/getting-to-know-the-mel-spectrogram-31bca3e2d9d0)\n", - "- [Jasper Paper](https://arxiv.org/abs/1904.03288)\n", - "- [QuartzNet paper](https://arxiv.org/abs/1910.10261)\n", - "- [SpecAugment Paper](https://arxiv.org/abs/1904.08779)\n", - "- [Explanation and visualization of SpecAugment](https://towardsdatascience.com/state-of-the-art-audio-data-augmentation-with-google-brains-specaugment-and-pytorch-d3d1a3ce291e)\n", - "- [Cutout Paper](https://arxiv.org/pdf/1708.04552.pdf)\n", - "- [Transfer Learning Blogpost](https://developer.nvidia.com/blog/jump-start-training-for-speech-recognition-models-with-nemo/)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V3ERGX86lR0V", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "v1Jk9etFlRzf" + }, + "source": [ + "# Introduction to End-To-End Automatic Speech Recognition\n", + "\n", + "This notebook contains a basic tutorial of Automatic Speech Recognition (ASR) concepts, introduced with code snippets using the [NeMo framework](https://github.com/NVIDIA/NeMo).\n", + "We will first introduce the basics of the main concepts behind speech recognition, then explore concrete examples of what the data looks like and walk through putting together a simple end-to-end ASR pipeline.\n", + "\n", + "We assume that you are familiar with general machine learning concepts and can follow Python code, and we'll be using the [AN4 dataset from CMU](http://www.speech.cs.cmu.edu/databases/an4/) (with processing using `sox`)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YLln3U-IlRzg" + }, + "source": [ + "## Conceptual Overview: What is ASR?\n", + "\n", + "ASR, or **Automatic Speech Recognition**, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text). Our goal is usually to have a model that minimizes the **Word Error Rate (WER)** metric when transcribing speech input. In other words, given some audio file (e.g. a WAV file) containing speech, how do we transform this into the corresponding text with as few errors as possible?\n", + "\n", + "Traditional speech recognition takes a generative approach, modeling the full pipeline of how speech sounds are produced: from a **language model** that encapsulates likely orderings of words (e.g. an n-gram model), to a **pronunciation model** for each word in the vocabulary (e.g. a pronunciation table), to an **acoustic model** that translates the pronunciations to audio waveforms (e.g. 
a Gaussian Mixture Model), and so on.\n", + "\n", + "Then, if we receive some spoken input, our goal would be to find the most likely sequence of text that would result in the given audio according to our pipeline of models. Overall, with traditional speech recognition, we try to model `Pr(audio|transcript)*Pr(transcript)`, and take the argmax of this over possible transcripts.\n", + "\n", + "Over time, neural nets advanced to the point where each component of the traditional speech recognition model could be replaced by a neural model that had better performance and that had a greater potential for generalization. For example, we could replace an n-gram model with a neural language model, and replace a pronunciation table with a neural pronunciation model, and so on. However, each of these neural models need to be trained individually on different tasks, and errors in any model in the pipeline could throw off the whole prediction.\n", + "\n", + "Thus, we can see the appeal of **end-to-end ASR architectures**--discriminative models that simply take an audio input and give a textual output, and in which all components are trained together towards the same goal. A much easier pipeline to handle!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0S5iZPMSlRzg" + }, + "source": [ + "### End-To-End ASR\n", + "\n", + "With an end-to-end model, we want to directly learn `Pr(transcript|audio)` in order to predict the transcripts from the original audio. Since we are dealing with sequential information--audio data over time that corresponds to a sequence of letters--RNNs are the obvious choice. But now we have a pressing problem to deal with: since our input sequence (number of audio timesteps) is not the same length as our desired output (transcript length), how do we match each time step from the audio data to the correct output characters?\n", + "\n", + "Earlier speech recognition approaches relied on **temporally-aligned data**, in which each segment of time in an audio file was matched up to a corresponding speech sound such as a phoneme or word. However, if we would like to have the flexibility to predict letter-by-letter to prevent OOV (out of vocabulary) issues, then each time step in the data would have to be labeled with the letter sound that the speaker is making at that point in the audio file. With that information, it seems like we should simply be able to try to predict the correct letter for each time step and then collapse the repeated letters (e.g. the prediction output `LLLAAAAPPTOOOPPPP` would become `LAPTOP`). It turns out that this idea has some problems: not only does alignment make the dataset incredibly labor-intensive to label, but also, what do we do with words like \"book\" that contain consecutive repeated letters? Simply squashing repeated letters together would not work in that case!\n", + "\n", + "![Alignment example](https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/asr/notebooks/images/alignment_example.png)\n", + "\n", + "Modern end-to-end approaches get around this using methods that don't require manual alignment at all, so that the input-output pairs are really just the raw audio and the transcript--no extra data or labeling required. 
Let's briefly go over two popular approaches that allow us to do this, Connectionist Temporal Classification (CTC) and sequence-to-sequence models with attention.\n", + "\n", + "#### Connectionist Temporal Classification (CTC)\n", + "\n", + "In normal speech recognition prediction output, we would expect to have characters such as the letters from A through Z, numbers 0 through 9, spaces (\"\\_\"), and so on. CTC introduces a new intermediate output token called the **blank token** (\"-\") that is useful for getting around the alignment issue.\n", + "\n", + "With CTC, we still predict one token per time segment of speech, but we use the blank token to figure out where we can and can't collapse the predictions. The appearance of a blank token helps separate repeating letters that should not be collapsed. For instance, with an audio snippet segmented into `T=11` time steps, we could get predictions that look like `BOO-OOO--KK`, which would then collapse to `\"BO-O-K\"`, and then we would remove the blank tokens to get our final output, `BOOK`.\n", + "\n", + "Now, we can predict one output token per time step, then collapse and clean to get sensible output without any fear of ambiguity from repeating letters! A simple way of getting predictions like this would be to apply a bidirectional RNN to the audio input, apply softmax over each time step's output, and then take the token with the highest probability. The method of always taking the best token at each time step is called **greedy decoding, or max decoding**.\n", + "\n", + "To calculate our loss for backprop, we would like to know the log probability of the model producing the correct transcript, `log(Pr(transcript|audio))`. We can get the log probability of a single intermediate output sequence (e.g. `BOO-OOO--KK`) by summing over the log probabilities we get from each token's softmax value, but note that the resulting sum is different from the log probability of the transcript itself (`BOOK`). This is because there are multiple possible output sequences of the same length that can be collapsed to get the same transcript (e.g. `BBO--OO-KKK` also results in `BOOK`), and so we need to **marginalize over every valid sequence of length `T` that collapses to the transcript**.\n", + "\n", + "Therefore, to get our transcript's log probability given our audio input, we must sum the log probabilities of every sequence of length `T` that collapses to the transcript (e.g. `log(Pr(output: \"BOOK\"|audio)) = log(Pr(BOO-OOO--KK|audio)) + log(Pr(BBO--OO-KKK|audio)) + ...`). In practice, we can use a dynamic programming approach to calculate this, accumulating our log probabilities over different \"paths\" through the softmax outputs at each time step.\n", + "\n", + "If you would like a more in-depth explanation of how CTC works, or how we can improve our results by using a modified beam search algorithm, feel free to check out the Further Reading section at the end of this notebook for more resources.\n", + "\n", + "#### Sequence-to-Sequence with Attention\n", + "\n", + "One problem with CTC is that predictions at different time steps are conditionally independent, which is an issue because the words in a continuous utterance tend to be related to each other in some sensible way. 
With this conditional independence assumption, we can't learn a language model that can represent such dependencies, though we can add a language model on top of the CTC output to mitigate this to some degree.\n", + "\n", + "A popular alternative is to use a sequence-to-sequence model with attention. A typical seq2seq model for ASR consists of some sort of **bidirectional RNN encoder** that consumes the audio sequence timestep-by-timestep, and where the outputs are then passed to an **attention-based decoder**. Each prediction from the decoder is based on attending to some parts of the entire encoded input, as well as the previously outputted tokens.\n", + "\n", + "The outputs of the decoder can be anything from word pieces to phonemes to letters, and since predictions are not directly tied to time steps of the input, we can just continue producing tokens one-by-one until an end token is given (or we reach a specified max output length). This way, we do not need to deal with audio alignment, and our predicted transcript is just the sequence of outputs given by our decoder.\n", + "\n", + "Now that we have an idea of what some popular end-to-end ASR models look like, let's take a look at the audio data we'll be working with for our example." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "38aYTCTIlRzh" + }, + "source": [ + "## Taking a Look at Our Data (AN4)\n", + "\n", + "The AN4 dataset, also known as the Alphanumeric dataset, was collected and published by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. We choose to use AN4 for this tutorial because it is relatively small, with 948 training and 130 test utterances, and so it trains quickly.\n", + "\n", + "Before we get started, let's download and prepare the dataset. The utterances are available as `.sph` files, so we will need to convert them to `.wav` for later processing. Please make sure you have [Sox](http://sox.sourceforge.net/) installed for this step (see the \"Downloads\" section of the main page)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gAhsmi6HlRzh" + }, + "outputs": [], + "source": [ + "# This is where the an4/ directory will be placed.\n", + "# Change this if you don't want the data to be extracted in the current directory.\n", + "data_dir = '.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Yb4fuUvWlRzk", + "scrolled": true + }, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import subprocess\n", + "import tarfile\n", + "import wget\n", + "\n", + "# Download the dataset. 
This will take a few moments...\n", + "print(\"******\")\n", + "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", + " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_path = wget.download(an4_url, data_dir)\n", + " print(f\"Dataset downloaded at: {an4_path}\")\n", + "else:\n", + " print(\"Tarfile already exists.\")\n", + " an4_path = data_dir + '/an4_sphere.tar.gz'\n", + "\n", + "if not os.path.exists(data_dir + '/an4/'):\n", + " # Untar and convert .sph to .wav (using sox)\n", + " tar = tarfile.open(an4_path)\n", + " tar.extractall(path=data_dir)\n", + "\n", + " print(\"Converting .sph to .wav...\")\n", + " sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", + " for sph_path in sph_list:\n", + " wav_path = sph_path[:-4] + '.wav'\n", + " cmd = [\"sox\", sph_path, wav_path]\n", + " subprocess.run(cmd)\n", + "print(\"Finished conversion.\\n******\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "m_LFeM0elRzm" + }, + "source": [ + "You should now have a folder called `an4` that contains `etc/an4_train.transcription`, `etc/an4_test.transcription`, audio files in `wav/an4_clstk` and `wav/an4test_clstk`, along with some other files we will not need.\n", + "\n", + "Now we can load and take a look at the data. As an example, file `cen2-mgah-b.wav` is a 2.6 second-long audio recording of a man saying the letters \"G L E N N\" one-by-one (feel free to check this out by listening to `./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav`). In an ASR task, the WAV file would be our input, and \"G L E N N\" would be our desired output.\n", + "\n", + "Let's plot the waveform, which is simply a line plot of the sequence of values that we read from the file. This is a format of viewing audio that you are likely to be familiar with seeing in many audio editors and visualizers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "MqIAKkqelRzm" + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import librosa\n", + "import librosa.display\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "\n", + "# Plot our example audio file's waveform\n", + "example_file = data_dir + '/an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'\n", + "audio, sample_rate = librosa.load(example_file)\n", + "\n", + "plt.rcParams['figure.figsize'] = (15,7)\n", + "plt.title('Waveform of Audio Example')\n", + "plt.ylabel('Amplitude')\n", + "\n", + "_ = librosa.display.waveplot(audio)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Gg6RR_yolRzo" + }, + "source": [ + "We can see the activity in the waveform that corresponds to each letter in the audio, as our speaker here enunciates quite clearly!\n", + "You can kind of tell that each spoken letter has a different \"shape,\" and it's interesting to note that last two blobs look relatively similar, which is expected because they are both the letter \"N.\"\n", + "\n", + "### Spectrograms and Mel Spectrograms\n", + "\n", + "However, since audio information is more useful in the context of frequencies of sound over time, we can get a better representation than this raw sequence of 57,330 values.\n", + "We can apply a [Fourier Transform](https://en.wikipedia.org/wiki/Fourier_transform) on our audio signal to get something more useful: a **spectrogram**, which is a representation of the energy levels (i.e. amplitude, or \"loudness\") of each frequency (i.e. 
pitch) of the signal over the duration of the file.\n", + "A spectrogram (which can be viewed as a heat map) is a good way of seeing how the *strengths of various frequencies in the audio vary over time*, and is obtained by breaking up the signal into smaller, usually overlapping chunks and performing a Short-Time Fourier Transform (STFT) on each.\n", + "\n", + "Let's examine what the spectrogram of our sample looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oCFneEs1lRzp" + }, + "outputs": [], + "source": [ + "# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)\n", + "spec = np.abs(librosa.stft(audio))\n", + "spec_db = librosa.amplitude_to_db(spec, ref=np.max) # Decibels\n", + "\n", + "# Use log scale to view frequencies\n", + "librosa.display.specshow(spec_db, y_axis='log', x_axis='time')\n", + "plt.colorbar()\n", + "plt.title('Audio Spectrogram');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9OPc4tcalRzs" + }, + "source": [ + "Again, we are able to see each letter being pronounced, and that the last two blobs that correspond to the \"N\"s are pretty similar-looking. But how do we interpret these shapes and colors? Just as in the waveform plot before, we see time passing on the x-axis (all 2.6s of audio). But now, the y-axis represents different frequencies (on a log scale), and *the color on the plot shows the strength of a frequency at a particular point in time*.\n", + "\n", + "We're still not done yet, as we can make one more potentially useful tweak: using the **Mel Spectrogram** instead of the normal spectrogram. This is simply a change in the frequency scale that we use from linear (or logarithmic) to the mel scale, which is \"a perceptual scale of pitches judged by listeners to be equal in distance from one another\" (from [Wikipedia](https://en.wikipedia.org/wiki/Mel_scale)).\n", + "\n", + "In other words, it's a transformation of the frequencies to be more aligned to what humans perceive; a change of +1000Hz from 2000Hz->3000Hz sounds like a larger difference to us than 9000Hz->10000Hz does, so the mel scale normalizes this such that equal distances sound like equal differences to the human ear. Intuitively, we use the mel spectrogram because in this case we are processing and transcribing human speech, such that transforming the scale to better match what we hear is a useful procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7yQXVn-TlRzt" + }, + "outputs": [], + "source": [ + "# Plot the mel spectrogram of our sample\n", + "mel_spec = librosa.feature.melspectrogram(audio, sr=sample_rate)\n", + "mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)\n", + "\n", + "librosa.display.specshow(\n", + " mel_spec_db, x_axis='time', y_axis='mel')\n", + "plt.colorbar()\n", + "plt.title('Mel Spectrogram');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RSCyVizDlRz1" + }, + "source": [ + "## Convolutional ASR Models\n", + "\n", + "Let's take a look at the model that we will be building, and how we specify its parameters.\n", + "\n", + "### The Jasper Model\n", + "\n", + "We will be training a small [Jasper (Just Another SPeech Recognizer) model](https://arxiv.org/abs/1904.03288) from scratch (e.g. initialized randomly). 
\n", + "In brief, Jasper architectures consist of a repeated block structure that utilizes 1D convolutions.\n", + "In a Jasper_KxR model, `R` sub-blocks (consisting of a 1D convolution, batch norm, ReLU, and dropout) are grouped into a single block, which is then repeated `K` times.\n", + "We also have a one extra block at the beginning and a few more at the end that are invariant of `K` and `R`, and we use CTC loss.\n", + "\n", + "### The QuartzNet Model\n", + "\n", + "The QuartzNet is better variant of Jasper with a key difference that it uses time-channel separable 1D convolutions. This allows it to dramatically reduce number of weights while keeping similar accuracy.\n", + "\n", + "A Jasper/QuartzNet models look like this (QuartzNet model is pictured):\n", + "\n", + "![QuartzNet with CTC](https://developer.nvidia.com/blog/wp-content/uploads/2020/05/quartznet-model-architecture-1-625x742.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gEpNci7slRzw" + }, + "source": [ + "# Using NeMo for Automatic Speech Recognition\n", + "\n", + "Now that we have an idea of what ASR is and how the audio data looks like, we can start using NeMo to do some ASR!\n", + "\n", + "We'll be using the **Neural Modules (NeMo) toolkit** for this part, so if you haven't already, you should download and install NeMo and its dependencies. To do so, just follow the directions on the [GitHub page](https://github.com/NVIDIA/NeMo), or in the [documentation](https://docs.nvidia.com/deeplearning/nemo/developer_guide/en/candidate/).\n", + "\n", + "NeMo lets us easily hook together the components (modules) of our model, such as the data layer, intermediate layers, and various losses, without worrying too much about implementation details of individual parts or connections between modules. NeMo also comes with complete models which only require your data and hyperparameters for training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4_W0lhaQlRzx" + }, + "outputs": [], + "source": [ + "# NeMo's \"core\" package\n", + "import nemo\n", + "# NeMo's ASR collection - this collections contains complete ASR models and\n", + "# building blocks (modules) for ASR\n", + "import nemo.collections.asr as nemo_asr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "v_W8EbYktZE3" + }, + "source": [ + "## Using an Out-of-the-Box Model\n", + "\n", + "NeMo's ASR collection comes with many building blocks and even complete models that we can use for training and evaluation. Moreover, several models come with pre-trained weights. Let's instantiate a complete QuartzNet15x5 model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "KFZZpYult96G" + }, + "outputs": [], + "source": [ + "# This line will download pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", + "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"QuartzNet15x5Base-En\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KucxoFJhum0i" + }, + "source": [ + "Next, we'll simply add paths to files we want to transcribe into the list and pass it to our model. Note that it will work for relatively short (<25 seconds) files. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3QCpR_93u1hp" + }, + "outputs": [], + "source": [ + "files = ['./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav']\n", + "for fname, transcription in zip(files, quartznet.transcribe(paths2audio_files=files)):\n", + " print(f\"Audio in {fname} was recognized as: {transcription}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ppUm_kuavm_f" + }, + "source": [ + "That was easy! But there are plenty of scenarios where you would want to fine-tune the model on your own data or even train from scratch. For example, this out-of-the box model will obviously not work for Spanish and would likely perform poorly for telephone audio. So if you have collected your own data, you certainly should attempt to fine-tune or train on it!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ABUDaC5Js7AW" + }, + "source": [ + "## Training from Scratch\n", + "\n", + "To train from scratch, you need to prepare your training data in the right format and specify your models architecture." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RdNyw1b_zgtm" + }, + "source": [ + "### Creating Data Manifests\n", + "\n", + "The first thing we need to do now is to create manifests for our training and evaluation data, which will contain the metadata of our audio files. NeMo data sets take in a standardized manifest format where each line corresponds to one sample of audio, such that the number of lines in a manifest is equal to the number of samples that are represented by that manifest. A line must contain the path to an audio file, the corresponding transcript (or path to a transcript file), and the duration of the audio sample.\n", + "\n", + "Here's an example of what one line in a NeMo-compatible manifest might look like:\n", + "```\n", + "{\"audio_filepath\": \"path/to/audio.wav\", \"duration\": 3.45, \"text\": \"this is a nemo tutorial\"}\n", + "```\n", + "\n", + "We can build our training and evaluation manifests using `an4/etc/an4_train.transcription` and `an4/etc/an4_test.transcription`, which have lines containing transcripts and their corresponding audio file IDs:\n", + "```\n", + "...\n", + " P I T T S B U R G H (cen5-fash-b)\n", + " TWO SIX EIGHT FOUR FOUR ONE EIGHT (cen7-fash-b)\n", + "...\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lVB1sG1GlRzz" + }, + "outputs": [], + "source": [ + "# --- Building Manifest Files --- #\n", + "import json\n", + "\n", + "# Function to build a manifest\n", + "def build_manifest(transcripts_path, manifest_path, wav_path):\n", + " with open(transcripts_path, 'r') as fin:\n", + " with open(manifest_path, 'w') as fout:\n", + " for line in fin:\n", + " # Lines look like this:\n", + " # transcript (fileID)\n", + " transcript = line[: line.find('(')-1].lower()\n", + " transcript = transcript.replace('', '').replace('', '')\n", + " transcript = transcript.strip()\n", + "\n", + " file_id = line[line.find('(')+1 : -2] # e.g. 
\"cen4-fash-b\"\n", + " audio_path = os.path.join(\n", + " data_dir, wav_path,\n", + " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", + " file_id + '.wav')\n", + "\n", + " duration = librosa.core.get_duration(filename=audio_path)\n", + "\n", + " # Write the metadata to the manifest\n", + " metadata = {\n", + " \"audio_filepath\": audio_path,\n", + " \"duration\": duration,\n", + " \"text\": transcript\n", + " }\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + " \n", + "# Building Manifests\n", + "print(\"******\")\n", + "train_transcripts = data_dir + '/an4/etc/an4_train.transcription'\n", + "train_manifest = data_dir + '/an4/train_manifest.json'\n", + "if not os.path.isfile(train_manifest):\n", + " build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n", + " print(\"Training manifest created.\")\n", + "\n", + "test_transcripts = data_dir + '/an4/etc/an4_test.transcription'\n", + "test_manifest = data_dir + '/an4/test_manifest.json'\n", + "if not os.path.isfile(test_manifest):\n", + " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", + " print(\"Test manifest created.\")\n", + "print(\"***Done***\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "W2fShQzRzo-M" + }, + "source": [ + "### Specifying Our Model with a YAML Config File\n", + "\n", + "For this tutorial, we'll build a *Jasper_4x1 model*, with `K=4` blocks of single (`R=1`) sub-blocks and a *greedy CTC decoder*, using the configuration found in `./configs/config.yaml`.\n", + "\n", + "If we open up this config file, we find model section which describes architecture of our model. A model contains an entry labeled `encoder`, with a field called `jasper` that contains a list with multiple entries. Each of the members in this list specifies one block in our model, and looks something like this:\n", + "```\n", + "- filters: 128\n", + " repeat: 1\n", + " kernel: [11]\n", + " stride: [2]\n", + " dilation: [1]\n", + " dropout: 0.2\n", + " residual: false\n", + " separable: true\n", + " se: true\n", + " se_context_size: -1\n", + "```\n", + "The first member of the list corresponds to the first block in the Jasper architecture diagram, which appears regardless of `K` and `R`.\n", + "Next, we have four entries that correspond to the `K=4` blocks, and each has `repeat: 1` since we are using `R=1`.\n", + "These are followed by two more entries for the blocks that appear at the end of our Jasper model before the CTC loss.\n", + "\n", + "There are also some entries at the top of the file that specify how we will handle training (`train_ds`) and validation (`validation_ds`) data.\n", + "\n", + "Using a YAML config such as this is helpful for getting a quick and human-readable overview of what your architecture looks like, and allows you to swap out model and run configurations easily without needing to change your code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PXVKBniMlRz5" + }, + "outputs": [], + "source": [ + "# --- Config Information ---#\n", + "from ruamel.yaml import YAML\n", + "config_path = './configs/config.yaml'\n", + "\n", + "yaml = YAML(typ='safe')\n", + "with open(config_path) as f:\n", + "    params = yaml.load(f)\n", + "print(params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wUmq3p2Aw_5N" + }, + "source": [ + "### Training with PyTorch Lightning\n", + "\n", + "NeMo models and modules can be used in any PyTorch code where `torch.nn.Module` is expected.\n", + "\n", + "However, NeMo's models are based on [PyTorch Lightning's](https://github.com/PyTorchLightning/pytorch-lightning) LightningModule, and we recommend you use PyTorch Lightning for training and fine-tuning, as it makes using mixed precision and distributed training very easy. So to start, let's create a Trainer instance for training on a GPU for 50 epochs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "GUfR6tAK0k2u" + }, + "outputs": [], + "source": [ + "import pytorch_lightning as pl\n", + "trainer = pl.Trainer(gpus=1, max_epochs=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IEn2RyvgxxvO" + }, + "source": [ + "Next, we instantiate an ASR model based on our ``config.yaml`` file from the previous section.\n", + "Note that this is also the stage at which we tell the model where our training and validation manifests are." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cbf0fsMK09lk" + }, + "outputs": [], + "source": [ + "from omegaconf import DictConfig\n", + "params['model']['train_ds']['manifest_filepath'] = train_manifest\n", + "params['model']['validation_ds']['manifest_filepath'] = test_manifest\n", + "first_asr_model = nemo_asr.models.EncDecCTCModel(cfg=DictConfig(params['model']), trainer=trainer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hWtzwL5qXTYq" + }, + "source": [ + "With that, we can start training with just one line!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "inRJsnrz1psq" + }, + "outputs": [], + "source": [ + "# Start training!!!\n", + "trainer.fit(first_asr_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jpYXX-GslR0E" + }, + "source": [ + "There we go! We've put together a full training pipeline for the model and trained it for 50 epochs.\n", + "\n", + "### After Training: Monitoring Progress and Changing Hyperparameters\n", + "We can now start TensorBoard to see how training went. Recall that WER stands for Word Error Rate, so the lower it is, the better." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "n_0y3stSXDX_" + }, + "outputs": [], + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir lightning_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Z0h-BME7U8yb" + }, + "source": [ + "We could improve this model by playing with hyperparameters. We can look at the current hyperparameters with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7kdQbpohXnEd" + }, + "outputs": [], + "source": [ + "print(params['model']['optim'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sGZzRCvIW8kE" + }, + "source": [ + "Let's say we wanted to change the learning rate. To do so, we can create a `new_opt` dict and set our desired learning rate, then call `.setup_optimization()` with the new optimization parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "AbigFKUtYgvn" + }, + "outputs": [], + "source": [ + "import copy\n", + "new_opt = copy.deepcopy(params['model']['optim'])\n", + "new_opt['lr'] = 0.001\n", + "first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))\n", + "# And then you can invoke trainer.fit(first_asr_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "D5Kwg8Cz-aaO" + }, + "source": [ + "## Inference\n", + "\n", + "Let's have a quick look at how one could run inference with NeMo's ASR model.\n", + "\n", + "First, ``EncDecCTCModel`` and its subclasses contain a handy ``transcribe`` method which can be used to simply obtain audio files' transcriptions. It also has a `batch_size` argument to improve performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3FT0klSV268p" + }, + "outputs": [], + "source": [ + "print(first_asr_model.transcribe(paths2audio_files=['./an4/wav/an4_clstk/mgah/cen2-mgah-b.wav',\n", + " './an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav',\n", + " './an4/wav/an4_clstk/fmjd/cen8-fmjd-b.wav',\n", + " './an4/wav/an4_clstk/fkai/cen8-fkai-b.wav'],\n", + " batch_size=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6FiCfLX0D7py" + }, + "source": [ + "Below is an example of a simple inference loop in pure PyTorch. It also shows how one can compute the Word Error Rate (WER) metric between predictions and references.\n",
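+    "\n",
+    "Before that, as a quick reminder of what WER actually measures, here is a tiny, self-contained sketch (a hypothetical helper, not the implementation NeMo uses): it computes the word-level Levenshtein distance between a hypothesis and a reference, divided by the number of reference words.\n",
+    "\n",
+    "```python\n",
+    "def word_error_rate(hypothesis: str, reference: str) -> float:\n",
+    "    hyp, ref = hypothesis.split(), reference.split()\n",
+    "    # d[i][j] = edit distance between the first i hypothesis words and the first j reference words\n",
+    "    d = [[0] * (len(ref) + 1) for _ in range(len(hyp) + 1)]\n",
+    "    for i in range(len(hyp) + 1):\n",
+    "        d[i][0] = i\n",
+    "    for j in range(len(ref) + 1):\n",
+    "        d[0][j] = j\n",
+    "    for i in range(1, len(hyp) + 1):\n",
+    "        for j in range(1, len(ref) + 1):\n",
+    "            sub = 0 if hyp[i - 1] == ref[j - 1] else 1\n",
+    "            d[i][j] = min(d[i - 1][j] + 1,        # extra word in the hypothesis\n",
+    "                          d[i][j - 1] + 1,        # missed reference word\n",
+    "                          d[i - 1][j - 1] + sub)  # substitution (or match)\n",
+    "    return d[len(hyp)][len(ref)] / max(len(ref), 1)\n",
+    "\n",
+    "print(word_error_rate('this is a nemo tutorial', 'this is a demo tutorial'))  # 0.2\n",
+    "```"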
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7mP4r1Gx_Ilt" + }, + "outputs": [], + "source": [ + "# Bigger batch-size = bigger throughput\n", + "params['model']['validation_ds']['batch_size'] = 16\n", + "\n", + "# Setup the test data loader and make sure the model is on GPU\n", + "first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])\n", + "first_asr_model.cuda()\n", + "\n", + "# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.\n", + "# WER is computed as numerator/denominator.\n", + "# We'll gather all the test batches' numerators and denominators.\n", + "wer_nums = []\n", + "wer_denoms = []\n", + "\n", + "# Loop over all test batches.\n", + "# Iterating over the model's `test_dataloader` will give us:\n", + "# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)\n", + "# See the AudioToCharDataset for more details.\n", + "for test_batch in first_asr_model.test_dataloader():\n", + " test_batch = [x.cuda() for x in test_batch]\n", + " targets = test_batch[2]\n", + " targets_lengths = test_batch[3] \n", + " log_probs, encoded_len, greedy_predictions = first_asr_model(\n", + " input_signal=test_batch[0], input_signal_length=test_batch[1]\n", + " )\n", + " # Notice the model has a helper object to compute WER\n", + " wer_num, wer_denom = first_asr_model._wer(greedy_predictions, targets, targets_lengths)\n", + " wer_nums.append(wer_num.detach().cpu().numpy())\n", + " wer_denoms.append(wer_denom.detach().cpu().numpy())\n", + "\n", + "# We need to sum all numerators and denominators first. Then divide.\n", + "print(f\"WER = {sum(wer_nums)/sum(wer_denoms)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0kM9kBNOCptf" + }, + "source": [ + "This WER is not particularly impressive and could be significantly improved. You could train longer (try 100 epochs) to get a better number. Check out the next section on how to improve it further." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RBcJtg5ulR0H" + }, + "source": [ + "## Model Improvements\n", + "\n", + "You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.\n", + "\n", + "### Data Augmentation\n", + "\n", + "There exist several ASR data augmentation methods that can increase the size of our training set.\n", + "\n", + "For example, we can perform augmentation on the spectrograms by zeroing out specific frequency segments (\"frequency masking\") or time segments (\"time masking\") as described by [SpecAugment](https://arxiv.org/abs/1904.08779), or zero out rectangles on the spectrogram as in [Cutout](https://arxiv.org/pdf/1708.04552.pdf). In NeMo, we can do all three of these by simply adding in a `SpectrogramAugmentation` neural module. (As of now, it does not perform the time warping from the SpecAugment paper.)\n", + "\n", + "Our toy model does not do spectrogram augmentation. 
But the real one we got from the cloud does:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9glGogaPlR0H" + }, + "outputs": [], + "source": [ + "print(quartznet._cfg['spec_augment'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LdwdcA_a640R" + }, + "source": [ + "If you want to enable SpecAugment in your model, make sure your .yaml config file contains a 'model/spec_augment' section which looks like the one above." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2f142kIQc1Z2" + }, + "source": [ + "### Transfer learning\n", + "\n", + "Transfer learning is an important machine learning technique that uses a model’s knowledge of one task to make it perform better on another. Fine-tuning is one of the techniques to perform transfer learning. It is an essential part of the recipe for many state-of-the-art results, where a base model is first pretrained on a task with abundant training data and then fine-tuned on different tasks of interest where the training data is less abundant or even scarce.\n", + "\n", + "In ASR, you might want to do fine-tuning in multiple scenarios, for example, when you want to improve your model's performance on a particular domain (medical, financial, etc.) or on accented speech. You can even transfer learn from one language to another! Check out [this paper](https://arxiv.org/abs/2005.04290) for examples.\n", + "\n", + "Transfer learning with NeMo is simple. Let's demonstrate how the model we got from the cloud could be fine-tuned on AN4 data. (NOTE: this is a toy example.) And, while we are at it, we will change the model's vocabulary, just to demonstrate how it's done." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hl320dsydWX0" + }, + "outputs": [], + "source": [ + "# Check what kind of vocabulary/alphabet the model has right now\n", + "print(quartznet.decoder.vocabulary)\n", + "\n", + "# Let's add the \"!\" symbol there. Note that you can (and should!) change the vocabulary\n", + "# entirely when fine-tuning using a different language.\n", + "quartznet.change_vocabulary(\n", + "    new_vocabulary=[\n", + "        ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',\n", + "        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', \"'\", \"!\"\n", + "    ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "M7lvmiMSd3Aw" + }, + "source": [ + "After this, our decoder has completely changed, but our encoder (which is where most of the weights are) remained intact. Let's fine-tune this model for 2 epochs on the AN4 dataset. We will also use the smaller learning rate from `new_opt` (see the \"After Training\" section).\n",
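+    "\n",
+    "If you wanted to adapt only the new decoder and keep the pre-trained encoder completely frozen, one optional approach (not used in the rest of this tutorial, just a common fine-tuning trick) would be to disable gradients for the encoder parameters:\n",
+    "\n",
+    "```python\n",
+    "# Optional: freeze the preserved encoder so that only the new decoder head is trained\n",
+    "for param in quartznet.encoder.parameters():\n",
+    "    param.requires_grad = False\n",
+    "```"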
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_PZJIso-eDl-" + }, + "outputs": [], + "source": [ + "# Use the smaller learning rate we set before\n", + "quartznet.setup_optimization(optim_config=DictConfig(new_opt))\n", + "\n", + "# Point to the data we'll use for fine-tuning as the training set\n", + "quartznet.setup_training_data(train_data_config=params['model']['train_ds'])\n", + "\n", + "# Point to the new validation data for fine-tuning\n", + "quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])\n", + "\n", + "# And now we can create a PyTorch Lightning trainer and call `fit` again.\n", + "trainer = pl.Trainer(gpus=1, max_epochs=2)\n", + "trainer.fit(quartznet)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VURa1NavlR0U" + }, + "source": [ + "### Fast Training\n", + "\n", + "Last but not least, we can simply speed up training of our model! If you have the resources, you can speed up training by splitting the workload across multiple GPUs. Otherwise (or in addition), there's always mixed precision training, which allows you to increase your batch size.\n", + "\n", + "You can use [PyTorch Lightning's Trainer object](https://pytorch-lightning.readthedocs.io/en/latest/trainer.html) to handle mixed-precision and distributed training for you. Below are some examples of flags you would pass to the `Trainer` to use these features:\n", + "\n", + "```python\n", + "# Mixed precision:\n", + "trainer = pl.Trainer(amp_level='O1', precision=16)\n", + "\n", + "# Trainer with a distributed backend:\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", + "\n", + "# Of course, you can combine these flags as well.\n", + "```\n", + "\n", + "Finally, have a look at [example scripts in the NeMo repository](https://github.com/NVIDIA/NeMo/blob/candidate/examples/asr/speech_to_text.py) which can handle mixed precision and distributed training using command-line arguments."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wteGqroafWg1" + }, + "source": [ + "## Under the Hood\n", + "\n", + "NeMo is open-source and we do all our model development in the open, so you can inspect our code if you wish.\n", + "\n", + "In particular, ``nemo_asr.model.EncDecCTCModel`` is an encoder-decoder model which is constructed using several ``Neural Modules`` taken from ``nemo_asr.modules.`` Here is what its forward pass looks like:\n", + "```python\n", + "def forward(self, input_signal, input_signal_length):\n", + " processed_signal, processed_signal_len = self.preprocessor(\n", + " input_signal=input_signal, length=input_signal_length,\n", + " )\n", + " # Spec augment is not applied during evaluation/testing\n", + " if self.spec_augmentation is not None and self.training:\n", + " processed_signal = self.spec_augmentation(input_spec=processed_signal)\n", + " encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_len)\n", + " log_probs = self.decoder(encoder_output=encoded)\n", + " greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)\n", + " return log_probs, encoded_len, greedy_predictions\n", + "```\n", + "Here:\n", + "\n", + "* ``self.preprocessor`` is an instance of ``nemo_asr.modules.AudioToMelSpectrogramPreprocessor``, which is a neural module that takes audio signal and converts it into a Mel-Spectrogram\n", + "* ``self.spec_augmentation`` - is a neural module of type ```nemo_asr.modules.SpectrogramAugmentation``, which implements data augmentation. \n", + "* ``self.encoder`` - is a convolutional Jasper/QuartzNet-like encoder of type ``nemo_asr.modules.ConvASREncoder``\n", + "* ``self.decoder`` - is a ``nemo_asr.modules.ConvASRDecoder`` which simply projects into the target alphabet (vocabulary).\n", + "\n", + "Also, ``EncDecCTCModel`` uses the audio dataset class ``nemo_asr.data.AudioToCharDataset`` and CTC loss implemented in ``nemo_asr.losses.CTCLoss``.\n", + "\n", + "You can use these and other neural modules (or create new ones yourself!) to construct new ASR models." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "smzlvbhelR0U" + }, + "source": [ + "# Further Reading/Watching:\n", + "\n", + "That's all for now! 
If you'd like to learn more about the topics covered in this tutorial, here are some resources that may interest you:\n", + "- [Stanford Lecture on ASR](https://www.youtube.com/watch?v=3MjIkWxXigM)\n", + "- [\"An Intuitive Explanation of Connectionist Temporal Classification\"](https://towardsdatascience.com/intuitively-understanding-connectionist-temporal-classification-3797e43a86c)\n", + "- [Explanation of CTC with Prefix Beam Search](https://medium.com/corti-ai/ctc-networks-and-language-models-prefix-beam-search-explained-c11d1ee23306)\n", + "- [Listen Attend and Spell Paper (seq2seq ASR model)](https://arxiv.org/abs/1508.01211)\n", + "- [Explanation of the mel spectrogram in more depth](https://towardsdatascience.com/getting-to-know-the-mel-spectrogram-31bca3e2d9d0)\n", + "- [Jasper Paper](https://arxiv.org/abs/1904.03288)\n", + "- [QuartzNet paper](https://arxiv.org/abs/1910.10261)\n", + "- [SpecAugment Paper](https://arxiv.org/abs/1904.08779)\n", + "- [Explanation and visualization of SpecAugment](https://towardsdatascience.com/state-of-the-art-audio-data-augmentation-with-google-brains-specaugment-and-pytorch-d3d1a3ce291e)\n", + "- [Cutout Paper](https://arxiv.org/pdf/1708.04552.pdf)\n", + "- [Transfer Learning Blogpost](https://developer.nvidia.com/blog/jump-start-training-for-speech-recognition-models-with-nemo/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V3ERGX86lR0V" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "ASR_with_NeMo.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/tutorials/asr/02_Speech_Commands.ipynb b/tutorials/asr/02_Speech_Commands.ipynb index ef105a81185b..fe60bba24586 100644 --- a/tutorials/asr/02_Speech_Commands.ipynb +++ b/tutorials/asr/02_Speech_Commands.ipynb @@ -1,1757 +1,1770 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "02_Speech_Commands.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "R12Yn6W1dt9t", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell.\n", - "\n", - "## Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg\n", - "!pip install unidecode\n", - "\n", - "# ## Install NeMo\n", - "!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@candidate#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", - "## Grab the config we'll use in this example\n", - "!mkdir configs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J6ycGIaZfSLE", - "colab_type": "text" - }, - "source": [ - "# Introduction\n", - "\n", - "This Speech Command recognition tutorial is based on the MatchboxNet model from the paper [\"MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition\"](https://arxiv.org/abs/2004.08531). MatchboxNet is a modified form of the QuartzNet architecture from the paper \"[QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/pdf/1910.10261.pdf)\" with a modified decoder head to suit classification tasks.\n", - "\n", - "The notebook will follow the steps below:\n", - "\n", - " - Dataset preparation: Preparing Google Speech Commands dataset\n", - "\n", - " - Audio preprocessing (feature extraction): signal normalization, windowing, (log) spectrogram (or mel scale spectrogram, or MFCC)\n", - "\n", - " - Data augmentation using SpecAugment \"[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779)\" to increase the number of data samples.\n", - " \n", - " - Develop a small Neural classification model that can be trained efficiently.\n", - " \n", - " - Model training on the Google Speech Commands dataset in NeMo.\n", - " \n", - " - Evaluation of error cases of the model by audibly hearing the samples" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "I62_LJzc-p2b", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Some utility imports\n", - "import os\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "K_M8wpkwd7d7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This is where the Google Speech Commands directory will be placed.\n", - "# Change this if you don't want the data to be extracted in the current directory.\n", - "# Select the version of the dataset required as well (can be 1 or 2)\n", - "DATASET_VER = 1\n", - "data_dir = './google_dataset_v{0}/'.format(DATASET_VER)\n", - "\n", - "if DATASET_VER == 1:\n", - " MODEL_CONFIG = \"matchboxnet_3x1x64_v1.yaml\"\n", - "else:\n", - " MODEL_CONFIG = \"matchboxnet_3x1x64_v2.yaml\"\n", - "\n", - "if not os.path.exists(f\"configs/{MODEL_CONFIG}\"):\n", - " !wget -P configs/ \"https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/asr/conf/{MODEL_CONFIG}\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvfwv9Hjf1Uv", - "colab_type": "text" - }, - "source": [ - "# Data Preparation\n", - "\n", - "We will be using the open-source Google Speech Commands Dataset (we will use V1 of the dataset for the tutorial but require minor changes to support the V2 dataset). 
These scripts below will download the dataset and convert it to a format suitable for use with NeMo." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6VL10OXTf8ts", - "colab_type": "text" - }, - "source": [ - "## Download the dataset\n", - "\n", - "The dataset must be prepared using the scripts provided under the `{NeMo root directory}/scripts` sub-directory. \n", - "\n", - "Run the following command below to download the data preparation script and execute it.\n", - "\n", - "**NOTE**: You should have at least 4GB of disk space available if you’ve used --data_version=1; and at least 6GB if you used --data_version=2. Also, it will take some time to download and process, so go grab a coffee.\n", - "\n", - "**NOTE**: You may additionally pass a `--rebalance` flag at the end of the `process_speech_commands_data.py` script to rebalance the class samples in the manifest." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oqKe6_uLfzKU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "if not os.path.exists(\"process_speech_commands_data.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/scripts/process_speech_commands_data.py" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TTsxp0nZ1zqo", - "colab_type": "text" - }, - "source": [ - "### Preparing the manifest file\n", - "\n", - "The manifest file is a simple file that has the full path to the audio file, the duration of the audio file, and the label that is assigned to that audio file. \n", - "\n", - "This notebook is only a demonstration, and therefore we will use the `--skip_duration` flag to speed up construction of the manifest file.\n", - "\n", - "**NOTE: When replicating the results of the paper, do not use this flag and prepare the manifest file with correct durations.**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "cWUtDpzKgop9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!mkdir {data_dir}\n", - "!python process_speech_commands_data.py --data_root={data_dir} --data_version={DATASET_VER} --skip_duration --log\n", - "print(\"Dataset ready !\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eVsPFxJtg30p", - "colab_type": "text" - }, - "source": [ - "## Prepare the path to manifest files" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ytTFGVe0g9wk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dataset_path = 'google_speech_recognition_v{0}'.format(DATASET_VER)\n", - "dataset_basedir = os.path.join(data_dir, dataset_path)\n", - "\n", - "train_dataset = os.path.join(dataset_basedir, 'train_manifest.json')\n", - "val_dataset = os.path.join(dataset_basedir, 'validation_manifest.json')\n", - "test_dataset = os.path.join(dataset_basedir, 'validation_manifest.json')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s0SZy9SEhOBf", - "colab_type": "text" - }, - "source": [ - "## Read a few rows of the manifest file \n", - "\n", - "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", - "\n", - "1) `audio_filepath`: Refers to the path to the raw audio file
\n", - "2) `command`: The class label (or speech command) of this sample
\n", - "3) `duration`: The length of the audio file, in seconds." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HYBidCMIhKQV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!head -n 5 {train_dataset}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r-pyUBedh8f4", - "colab_type": "text" - }, - "source": [ - "# Training - Preparation\n", - "\n", - "We will be training a MatchboxNet model from the paper [\"MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition\"](https://arxiv.org/abs/2004.08531). The benefit of MatchboxNet over JASPER models is that they use 1D Time-Channel Separable Convolutions, which greatly reduce the number of parameters required to obtain good model accuracy.\n", - "\n", - "MatchboxNet models generally follow the model definition pattern QuartzNet-[BxRXC], where B is the number of blocks, R is the number of convolutional sub-blocks, and C is the number of channels in these blocks. Each sub-block contains a 1-D masked convolution, batch normalization, ReLU, and dropout.\n", - "\n", - "An image of QuartzNet, the base configuration of MatchboxNet models, is provided below.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T0sV4riijHJF", - "colab_type": "text" - }, - "source": [ - "

\n", - " \n", - "

" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ieAPOM9thTN2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# NeMo's \"core\" package\n", - "import nemo\n", - "# NeMo's ASR collection - this collections contains complete ASR models and\n", - "# building blocks (modules) for ASR\n", - "import nemo.collections.asr as nemo_asr" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ss9gLcDv30jI", - "colab_type": "text" - }, - "source": [ - "## Model Configuration\n", - "The MatchboxNet Model is defined in a config file which declares multiple important sections.\n", - "\n", - "They are:\n", - "\n", - "1) `model`: All arguments that will relate to the Model - preprocessors, encoder, decoder, optimizer and schedulers, datasets and any other related information\n", - "\n", - "2) `trainer`: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yoVAs9h1lfci", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This line will print the entire config of the MatchboxNet model\n", - "config_path = f\"configs/{MODEL_CONFIG}\"\n", - "config = OmegaConf.load(config_path)\n", - "print(config.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "m2lJPR0a3qww", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Preserve some useful parameters\n", - "labels = config.model.labels\n", - "sample_rate = config.sample_rate" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8_pmjeed78rJ", - "colab_type": "text" - }, - "source": [ - "### Setting up the datasets within the config\n", - "\n", - "If you'll notice, there are a few config dictionaries called `train_ds`, `validation_ds` and `test_ds`. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DIe6Qfs18MiQ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(config.model.train_ds.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fb01hl868Uc3", - "colab_type": "text" - }, - "source": [ - "### `???` inside configs\n", - "\n", - "You will often notice that some configs have `???` in place of paths. This is used as a placeholder so that the user can change the value at a later time.\n", - "\n", - "Let's add the paths to the manifests to the config above." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "m181HXev8T97", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.train_ds.manifest_filepath = train_dataset\n", - "config.model.validation_ds.manifest_filepath = val_dataset\n", - "config.model.test_ds.manifest_filepath = test_dataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbXngoCM5IRG", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem!\n", - "\n", - "Lets first instantiate a Trainer object!" 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bYtvdBlG5afU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import torch\n", - "import pytorch_lightning as pl" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jRN18CdH51nN", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(config.trainer.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gHf6cHvm6H9b", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets modify some trainer configs for this demo\n", - "# Checks if we have GPU available and uses it\n", - "cuda = 1 if torch.cuda.is_available() else 0\n", - "config.trainer.gpus = cuda\n", - "\n", - "# Reduces maximum number of epochs to 5 for quick demonstration\n", - "config.trainer.max_epochs = 5\n", - "\n", - "# Remove distributed training flags\n", - "config.trainer.distributed_backend = None" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "UB9nr7G56G3L", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2wt603Vq6sqX", - "colab_type": "text" - }, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it ! " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "TfWJFg7p6Ezf", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.utils.exp_manager import exp_manager" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SC-QPoW44-p2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Yqi6rkNR7Dph", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# The exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "t0zz-vHH7Uuh", - "colab_type": "text" - }, - "source": [ - "## Building the MatchboxNet Model\n", - "\n", - "MatchboxNet is an ASR model with a classification task - it generates one label for the entire provided audio stream. Therefore we encapsulate it inside the `EncDecClassificationModel` as follows." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FRMrKhyf5vhy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "asr_model = nemo_asr.models.EncDecClassificationModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jA9UND-Q_oyw", - "colab_type": "text" - }, - "source": [ - "# Training a MatchboxNet Model\n", - "\n", - "As MatchboxNet is inherently a PyTorch Lightning Model, it can easily be trained in a single line - `trainer.fit(model)` !" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3ngKcRFqBfIF", - "colab_type": "text" - }, - "source": [ - "### Monitoring training progress\n", - "\n", - "Before we begin training, lets first create a Tensorboard visualization to monitor progress\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Cyfec0PDBsXa", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Load the TensorBoard notebook extension\n", - "%load_ext tensorboard" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "4L5ymu-QBxmz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%tensorboard --logdir {exp_dir}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZApuELDIKQgC", - "colab_type": "text" - }, - "source": [ - "### Training for 5 epochs\n", - "We see below that the model begins to get modest scores on the validation set after just 5 epochs of training" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9xiUUJlH5KdD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.fit(asr_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Dkds1jSvKgSc", - "colab_type": "text" - }, - "source": [ - "### Evaluation on the Test set\n", - "\n", - "Lets compute the final score on the test set via `trainer.test(model)`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mULTrhEJ_6wV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.test(asr_model, ckpt_path=None)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XQntce8cLiUC", - "colab_type": "text" - }, - "source": [ - "# Fast Training\n", - "\n", - "We can dramatically improve the time taken to train this model by using Multi GPU training along with Mixed Precision.\n", - "\n", - "For multi-GPU training, take a look at [the PyTorch Lightning Multi-GPU training section](https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html)\n", - "\n", - "For mixed-precision training, take a look at [the PyTorch Lightning Mixed-Precision training section](https://pytorch-lightning.readthedocs.io/en/latest/apex.html)\n", - "\n", - "```python\n", - "# Mixed precision:\n", - "trainer = Trainer(amp_level='O1', precision=16)\n", - "\n", - "# Trainer with a distributed backend:\n", - "trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", - "\n", - "# Of course, you can combine these flags as well.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ifDHkunjM8y6", - "colab_type": "text" - }, - "source": [ - "# Evaluation of incorrectly predicted samples\n", - "\n", - "Given that we have a trained model, which performs reasonably well, let's try to listen to the samples where the model is least confident in its predictions.\n", - "\n", - "For this, we need the support of the librosa library.\n", - "\n", - "**NOTE**: The following code depends on librosa. To install it, run the following code block first." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "s3w3LhHcKuD2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!pip install librosa" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PcJrZ72sNCkM", - "colab_type": "text" - }, - "source": [ - "## Extract the predictions from the model\n", - "\n", - "We want to possess the actual logits of the model instead of just the final evaluation score, so we can define a function to perform the forward step for us without computing the final loss. Instead, we extract the logits per batch of samples provided." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rvxdviYtOFjK", - "colab_type": "text" - }, - "source": [ - "## Accessing the data loaders\n", - "\n", - "We can utilize the `setup_test_data` method in order to instantiate a data loader for the dataset we want to analyze.\n", - "\n", - "For convenience, we can access these instantiated data loaders using the following accessors - `asr_model._train_dl`, `asr_model._validation_dl` and `asr_model._test_dl`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "CB0QZCAmM656", - "colab_type": "code", - "colab": {} - }, - "source": [ - "asr_model.setup_test_data(config.model.test_ds)\n", - "test_dl = asr_model._test_dl" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rA7gXawcPoip", - "colab_type": "text" - }, - "source": [ - "## Partial Test Step\n", - "\n", - "Below we define a utility function to perform most of the test step. For reference, the test step is defined as follows:\n", - "\n", - "```python\n", - " def test_step(self, batch, batch_idx, dataloader_idx=0):\n", - " audio_signal, audio_signal_len, labels, labels_len = batch\n", - " logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", - " loss_value = self.loss(logits=logits, labels=labels)\n", - " correct_counts, total_counts = self._accuracy(logits=logits, labels=labels)\n", - " return {'test_loss': loss_value, 'test_correct_counts': correct_counts, 'test_total_counts': total_counts}\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "sBsDOm5ROpQI", - "colab_type": "code", - "colab": {} - }, - "source": [ - "@torch.no_grad()\n", - "def extract_logits(model, dataloader):\n", - " logits_buffer = []\n", - " label_buffer = []\n", - "\n", - " # Follow the above definition of the test_step\n", - " for batch in dataloader:\n", - " audio_signal, audio_signal_len, labels, labels_len = batch\n", - " logits = model(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", - "\n", - " logits_buffer.append(logits)\n", - " label_buffer.append(labels)\n", - " print(\".\", end='')\n", - " print()\n", - " \n", - " print(\"Finished extracting logits !\")\n", - " logits = torch.cat(logits_buffer, 0)\n", - " labels = torch.cat(label_buffer, 0)\n", - " return logits, labels\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mZSdprUlOuoV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "cpu_model = asr_model.cpu()\n", - "cpu_model.eval()\n", - "logits, labels = extract_logits(cpu_model, test_dl)\n", - "print(\"Logits:\", logits.shape, \"Labels :\", labels.shape)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "9Wd0ukgNXRBz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Compute accuracy - 
`_accuracy` is a PyTorch Lightning Metric !\n", - "correct_count, total_count = cpu_model._accuracy(logits=logits, labels=labels)\n", - "print(\"Accuracy : \", float(correct_count * 100.) / float(total_count))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NwN9OSqCauSH", - "colab_type": "text" - }, - "source": [ - "## Filtering out incorrect samples\n", - "Let us now filter out the incorrectly labeled samples from the total set of samples in the test set" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "N1YJvsmcZ0uE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import librosa\n", - "import json\n", - "import IPython.display as ipd" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jZAT9yGAayvR", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# First lets create a utility class to remap the integer class labels to actual string label\n", - "class ReverseMapLabel:\n", - " def __init__(self, data_loader):\n", - " self.label2id = dict(data_loader.dataset.label2id)\n", - " self.id2label = dict(data_loader.dataset.id2label)\n", - "\n", - " def __call__(self, pred_idx, label_idx):\n", - " return self.id2label[pred_idx], self.id2label[label_idx]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "X3GSXvYHa4KJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Next, lets get the indices of all the incorrectly labeled samples\n", - "sample_idx = 0\n", - "incorrect_preds = []\n", - "rev_map = ReverseMapLabel(test_dl)\n", - "\n", - "# Remember, evaluated_tensor = (loss, logits, labels)\n", - "probs = torch.softmax(logits, dim=-1)\n", - "probas, preds = torch.max(probs, dim=-1)\n", - "\n", - "incorrect_ids = (preds != labels).nonzero()\n", - "for idx in incorrect_ids:\n", - " proba = float(probas[idx][0])\n", - " pred = int(preds[idx][0])\n", - " label = int(labels[idx][0])\n", - " idx = int(idx[0]) + sample_idx\n", - "\n", - " incorrect_preds.append((idx, *rev_map(pred, label), proba))\n", - "\n", - "print(f\"Num test samples : {total_count.item()}\")\n", - "print(f\"Num errors : {len(incorrect_preds)}\")\n", - "\n", - "# First lets sort by confidence of prediction\n", - "incorrect_preds = sorted(incorrect_preds, key=lambda x: x[-1], reverse=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0JgGo71gcDtD", - "colab_type": "text" - }, - "source": [ - "## Examine a subset of incorrect samples\n", - "Let's print out the (test id, predicted label, ground truth label, confidence) tuple of first 20 incorrectly labeled samples" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "x37wNJsNbcw0", - "colab_type": "code", - "colab": {} - }, - "source": [ - "for incorrect_sample in incorrect_preds[:20]:\n", - " print(str(incorrect_sample))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tDnwYsDKcLv9", - "colab_type": "text" - }, - "source": [ - "## Define a threshold below which we designate a model's prediction as \"low confidence\"" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "dpvzeh4PcGJs", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Filter out how many such samples exist\n", - "low_confidence_threshold = 0.25\n", - "count_low_confidence = len(list(filter(lambda x: x[-1] <= low_confidence_threshold, incorrect_preds)))\n", 
- "print(f\"Number of low confidence predictions : {count_low_confidence}\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ERXyXvCAcSKR", - "colab_type": "text" - }, - "source": [ - "## Lets hear the samples which the model has least confidence in !" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kxjNVjX8cPNP", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# First lets create a helper function to parse the manifest files\n", - "def parse_manifest(manifest):\n", - " data = []\n", - " for line in manifest:\n", - " line = json.loads(line)\n", - " data.append(line)\n", - "\n", - " return data" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "IWxqw5k-cUVd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Next, lets create a helper function to actually listen to certain samples\n", - "def listen_to_file(sample_id, pred=None, label=None, proba=None):\n", - " # Load the audio waveform using librosa\n", - " filepath = test_samples[sample_id]['audio_filepath']\n", - " audio, sample_rate = librosa.load(filepath)\n", - "\n", - " if pred is not None and label is not None and proba is not None:\n", - " print(f\"Sample : {sample_id} Prediction : {pred} Label : {label} Confidence = {proba: 0.4f}\")\n", - " else:\n", - " print(f\"Sample : {sample_id}\")\n", - "\n", - " return ipd.Audio(audio, rate=sample_rate)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "HPj1tFNIcXaU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Now lets load the test manifest into memory\n", - "test_samples = []\n", - "with open(test_dataset, 'r') as test_f:\n", - " test_samples = test_f.readlines()\n", - "\n", - "test_samples = parse_manifest(test_samples)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nt7b_uiScZcC", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Finally, lets listen to all the audio samples where the model made a mistake\n", - "# Note: This list of incorrect samples may be quite large, so you may choose to subsample `incorrect_preds`\n", - "count = min(count_low_confidence, 20) # replace this line with just `count_low_confidence` to listen to all samples with low confidence\n", - "\n", - "for sample_id, pred, label, proba in incorrect_preds[:count]:\n", - " ipd.display(listen_to_file(sample_id, pred=pred, label=label, proba=proba))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gxLGGDvHW2kV", - "colab_type": "text" - }, - "source": [ - "# Fine-tuning on a new dataset\n", - "\n", - "We currently trained our dataset on all 30/35 classes of the Google Speech Commands dataset (v1/v2).\n", - "\n", - "We will now show an example of fine-tuning a trained model on a subset of the classes, as a demonstration of fine-tuning.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mZAPGTzeXnuQ", - "colab_type": "text" - }, - "source": [ - "## Preparing the data-subsets\n", - "\n", - "Lets select 2 of the classes, `yes` and `no` and prepare our manifests with this dataset." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "G1RI4GBNfjUW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import json" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "L3cFvN5vcbjb", - "colab_type": "code", - "colab": {} - }, - "source": [ - "def extract_subset_from_manifest(name: str, manifest_path: str, labels: list):\n", - " manifest_dir = os.path.split(manifest_path)[0]\n", - " labels = set(labels)\n", - " manifest_values = []\n", - "\n", - " print(f\"Parsing manifest: {manifest_path}\")\n", - " with open(manifest_path, 'r') as f:\n", - " for line in f:\n", - " val = json.loads(line)\n", - "\n", - " if val['command'] in labels:\n", - " manifest_values.append(val)\n", - "\n", - " print(f\"Number of files extracted from dataset: {len(manifest_values)}\")\n", - "\n", - " outpath = os.path.join(manifest_dir, name)\n", - " with open(outpath, 'w') as f:\n", - " for val in manifest_values:\n", - " json.dump(val, f)\n", - " f.write(\"\\n\")\n", - " f.flush()\n", - "\n", - " print(\"Manifest subset written to path :\", outpath)\n", - " print()\n", - "\n", - " return outpath" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "fXQ0N1evfqZ8", - "colab_type": "code", - "colab": {} - }, - "source": [ - "labels = [\"yes\", \"no\"]\n", - "\n", - "train_subdataset = extract_subset_from_manifest(\"train_subset.json\", train_dataset, labels)\n", - "val_subdataset = extract_subset_from_manifest(\"val_subset.json\", val_dataset, labels)\n", - "test_subdataset = extract_subset_from_manifest(\"test_subset.json\", test_dataset, labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IO5pVNyKimiE", - "colab_type": "text" - }, - "source": [ - "## Saving/Restoring a checkpoint\n", - "\n", - "There are multiple ways to save and load models in NeMo. Since all NeMo models are inherently Lightning Modules, we can use the standard way that PyTorch Lightning saves and restores models.\n", - "\n", - "NeMo also provides a more advanced model save/restore format, which encapsulates all the parts of the model that are required to restore that model for immediate use.\n", - "\n", - "In this example, we will explore both ways of saving and restoring models, but we will focus on the PyTorch Lightning method." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lMKvrT88jZwC", - "colab_type": "text" - }, - "source": [ - "### Saving and Restoring via PyTorch Lightning Checkpoints\n", - "\n", - "When using NeMo for training, it is advisable to utilize the `exp_manager` framework. It is tasked with handling checkpointing and logging (Tensorboard as well as WandB optionally!), as well as dealing with multi-node and multi-GPU logging.\n", - "\n", - "Since we utilized the `exp_manager` framework above, we have access to the directory where the checkpoints exist. \n", - "\n", - "`exp_manager` with the default settings will save multiple checkpoints for us - \n", - "\n", - "1) A few checkpoints from certain steps of training. They will have `--val_loss=` tags\n", - "\n", - "2) A checkpoint at the last epoch of training denotes by `--last`.\n", - "\n", - "3) If the model finishes training, it will also have a `--end` checkpoint." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "TcHTw5ErmQRi", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import glob" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "5h8zMJHngUrV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(exp_dir)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "F9K_Ct_hl8oU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Let's list all the checkpoints we have\n", - "checkpoint_dir = os.path.join(exp_dir, 'checkpoints')\n", - "checkpoint_paths = list(glob.glob(os.path.join(checkpoint_dir, \"*.ckpt\")))\n", - "checkpoint_paths" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "67fbB61umfb4", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# We want the checkpoint saved after the final step of training\n", - "final_checkpoint = list(filter(lambda x: \"--end.ckpt\" in x, checkpoint_paths))[0]\n", - "print(final_checkpoint)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZADUzv02nknZ", - "colab_type": "text" - }, - "source": [ - "### Restoring from a PyTorch Lightning checkpoint\n", - "\n", - "To restore the model, we use the `LightningModule.load_from_checkpoint()` class method." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ywd9Qj4Xm3VC", - "colab_type": "code", - "colab": {} - }, - "source": [ - "restored_model = nemo_asr.models.EncDecClassificationModel.load_from_checkpoint(final_checkpoint)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0f4GQa8vB1BB", - "colab_type": "text" - }, - "source": [ - "## Prepare the model for fine-tuning\n", - "\n", - "Remember, the original model was trained for a 30/35-way classification task. Now we require only a subset of these classes, so we need to modify the decoder head to support fewer classes.\n", - "\n", - "We can do this easily with the convenient method `EncDecClassificationModel.change_labels(new_label_list)`.\n", - "\n", - "By performing this step, we discard the old decoder head but still preserve the encoder!" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "iMCMds7pB16U", - "colab_type": "code", - "colab": {} - }, - "source": [ - "restored_model.change_labels(labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rrspQ2QFtbCK", - "colab_type": "text" - }, - "source": [ - "### Prepare the data loaders\n", - "\n", - "Upon restoration, the restored model will not attempt to set up any data loaders.\n", - "\n", - "This is so that we can manually set up any datasets we want - train and val to fine-tune the model, test in order to just evaluate, or all three to do both!\n", - "\n", - "The entire config that we used before can still be accessed via `ModelPT._cfg`, so we will use it in order to set up our data loaders. This also gives us the opportunity to set any additional parameters we wish to set!"
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9JxhiZN5ulUl", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import copy" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qzHfTOkPowJo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "train_subdataset_cfg = copy.deepcopy(restored_model._cfg.train_ds)\n", - "val_subdataset_cfg = copy.deepcopy(restored_model._cfg.validation_ds)\n", - "test_subdataset_cfg = copy.deepcopy(restored_model._cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "it9-vFX6vHUl", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Set the paths to the subset of the dataset\n", - "train_subdataset_cfg.manifest_filepath = train_subdataset\n", - "val_subdataset_cfg.manifest_filepath = val_subdataset\n", - "test_subdataset_cfg.manifest_filepath = test_subdataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "1qzWY8QDvgfc", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Setup the data loader for the restored model\n", - "restored_model.setup_training_data(train_subdataset_cfg)\n", - "restored_model.setup_multiple_validation_data(val_subdataset_cfg)\n", - "restored_model.setup_multiple_test_data(test_subdataset_cfg)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "y8GZ5a5rC0gY", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Check data loaders are correct\n", - "print(\"Train dataset labels :\", restored_model._train_dl.dataset.labels)\n", - "print(\"Val dataset labels :\", restored_model._validation_dl.dataset.labels)\n", - "print(\"Test dataset labels :\", restored_model._test_dl.dataset.labels)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "76yDcWZ9zl2G", - "colab_type": "text" - }, - "source": [ - "## Setting up a new Trainer and Experiment Manager\n", - "\n", - "A restored model has a utility method to attach the Trainer object to it, which is necessary in order to correctly set up the optimizer and scheduler!\n", - "\n", - "**Note**: The restored model does not contain the trainer config with it. It is necessary to create a new Trainer object suitable for the environment where the model is being trained. The template can be replicated from any of the training scripts.\n", - "\n", - "Here, since we already had the previous config object that prepared the trainer, we could have used it, but for demonstration, we will set up the trainer config manually." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "swTe3WvBzkBJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Setup the new trainer object\n", - "# Lets modify some trainer configs for this demo\n", - "# Checks if we have GPU available and uses it\n", - "cuda = 1 if torch.cuda.is_available() else 0\n", - "\n", - "trainer_config = OmegaConf.create(dict(\n", - " gpus=cuda,\n", - " max_epochs=5,\n", - " max_steps=None, # computed at runtime if not set\n", - " num_nodes=1,\n", - " accumulate_grad_batches=1,\n", - " checkpoint_callback=False, # Provided by exp_manager\n", - " logger=False, # Provided by exp_manager\n", - " row_log_interval=1, # Interval of logging.\n", - " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", - "))\n", - "print(trainer_config.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nd_ej4bI3TIy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer_finetune = pl.Trainer(**trainer_config)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WtGu5q5T32XA", - "colab_type": "text" - }, - "source": [ - "### Setting the trainer to the restored model\n", - "\n", - "All NeMo models provide a convenience method `set_trainer()` in order to setup the trainer after restoration" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BTozhedA3zpM", - "colab_type": "code", - "colab": {} - }, - "source": [ - "restored_model.set_trainer(trainer_finetune)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XojTpEiI3TQa", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir_finetune = exp_manager(trainer_finetune, config.get(\"exp_manager\", None))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "x_LSbmCQ3TUf", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir_finetune = str(exp_dir_finetune)\n", - "exp_dir_finetune" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QT_mWWnSxPLv", - "colab_type": "text" - }, - "source": [ - "## Setup optimizer + scheduler\n", - "\n", - "For a fine-tuning experiment, lets set up the optimizer and scheduler!\n", - "\n", - "We will use a much lower learning rate than before, and also swap out the scheduler from PolyHoldDecay to CosineDecay." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "TugHsePsxA5Q", - "colab_type": "code", - "colab": {} - }, - "source": [ - "optim_sched_cfg = copy.deepcopy(restored_model._cfg.optim)\n", - "# Struct mode prevents us from popping off elements from the config, so lets disable it\n", - "OmegaConf.set_struct(optim_sched_cfg, False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "pZSo0sWPxwiG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets change the maximum learning rate to previous minimum learning rate\n", - "optim_sched_cfg.lr = 0.001\n", - "\n", - "# Lets change the scheduler\n", - "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n", - "\n", - "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n", - "optim_sched_cfg.sched.pop('power')\n", - "\n", - "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n", - "optim_sched_cfg.sched.pop('hold_ratio')\n", - "\n", - "# Set \"min_lr\" to lower value\n", - "optim_sched_cfg.sched.min_lr = 1e-4\n", - "\n", - "print(optim_sched_cfg.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "FqqyFF3Ey5If", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Now lets update the optimizer settings\n", - "restored_model.setup_optimization(optim_sched_cfg)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mdivgIPUzgP_", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# We can also just directly replace the config inplace if we choose to\n", - "restored_model._cfg.optim = optim_sched_cfg" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3-lRyz2_Eyrl", - "colab_type": "text" - }, - "source": [ - "## Fine-tune training step\n", - "\n", - "We fine-tune on the subset classification problem. Note, the model was originally trained on these classes (the subset defined here has already been trained on above).\n", - "\n", - "When fine-tuning on a truly new dataset, we will not see such a dramatic improvement in performance. However, it should still converge a little faster than if it was trained from scratch." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nq-iHIgx6OId", - "colab_type": "text" - }, - "source": [ - "### Monitor training progress via Tensorboard\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PIacDWcD5vCR", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%tensorboard --logdir {exp_dir_finetune}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r5_z1eW76fip", - "colab_type": "text" - }, - "source": [ - "### Fine-tuning for 5 epochs" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "WH8rN6dA6V9S", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer_finetune.fit(restored_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lgV0s8auJpxV", - "colab_type": "text" - }, - "source": [ - "### Evaluation on the Test set\n", - "\n", - "Lets compute the final score on the test set via `trainer.test(model)`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "szpLp6XTDPaK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer_finetune.test(restored_model, ckpt_path=None)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uNBAaf1FKcAZ", - "colab_type": "text" - }, - "source": [ - "## Advanced Usage: Exporting a model in its entirety\n", - "\n", - "While most models can be easily serialized via the Experiment Manager as a PyTorch Lightning checkpoint, there are certain models where this is insufficient. \n", - "\n", - "Consider the case where a Model contains artifacts such as tokenizers or other intermediate file objects that cannot be so easily serialized into a checkpoint.\n", - "\n", - "For such cases, NeMo offers two utility functions that enable serialization of a Model + artifacts - `save_to` and `restore_from`.\n", - "\n", - "Further documentation regarding these methods can be obtained from the documentation pages on NeMo." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Dov9g2j8Lyjs", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import tarfile" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "WNixPPFNJyNc", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Save a model as a tarfile\n", - "restored_model.save_to(os.path.join(exp_dir_finetune, \"model.nemo\"))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "B2RHYNjjLrcW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# The above object is just a tarfile which can store additional artifacts.\n", - "with tarfile.open(os.path.join(exp_dir_finetune, 'model.nemo')) as blob:\n", - " for item in blob:\n", - " print(item)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "fRo04x3TLxdu", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Restore a model from a tarfile\n", - "restored_model_2 = nemo_asr.models.EncDecClassificationModel.restore_from(os.path.join(exp_dir_finetune, \"model.nemo\"))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LyIegk2CPNsI", - "colab_type": "text" - }, - "source": [ - "## Conclusion\n", - "Once the model has been restored, either via a PyTorch Lightning checkpoint or via the `restore_from` methods, one can finetune by following the above general steps." - ] - } - ] -} \ No newline at end of file + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "R12Yn6W1dt9t" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "## Install dependencies\n", + "!pip install wget\n", + "!apt-get install sox libsndfile1 ffmpeg\n", + "!pip install unidecode\n", + "\n", + "# ## Install NeMo\n", + "!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]\n", + "\n", + "## Install TorchAudio\n", + "!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "## Grab the config we'll use in this example\n", + "!mkdir configs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "J6ycGIaZfSLE" + }, + "source": [ + "# Introduction\n", + "\n", + "This Speech Command recognition tutorial is based on the MatchboxNet model from the paper [\"MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition\"](https://arxiv.org/abs/2004.08531). 
MatchboxNet is a modified form of the QuartzNet architecture from the paper \"[QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/pdf/1910.10261.pdf)\" with a modified decoder head to suit classification tasks.\n", + "\n", + "The notebook will follow the steps below:\n", + "\n", + " - Dataset preparation: Preparing Google Speech Commands dataset\n", + "\n", + " - Audio preprocessing (feature extraction): signal normalization, windowing, (log) spectrogram (or mel scale spectrogram, or MFCC)\n", + "\n", + " - Data augmentation using SpecAugment \"[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779)\" to increase the number of data samples.\n", + " \n", + " - Develop a small Neural classification model that can be trained efficiently.\n", + " \n", + " - Model training on the Google Speech Commands dataset in NeMo.\n", + " \n", + " - Evaluation of error cases of the model by audibly hearing the samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "I62_LJzc-p2b" + }, + "outputs": [], + "source": [ + "# Some utility imports\n", + "import os\n", + "from omegaconf import OmegaConf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "K_M8wpkwd7d7" + }, + "outputs": [], + "source": [ + "# This is where the Google Speech Commands directory will be placed.\n", + "# Change this if you don't want the data to be extracted in the current directory.\n", + "# Select the version of the dataset required as well (can be 1 or 2)\n", + "DATASET_VER = 1\n", + "data_dir = './google_dataset_v{0}/'.format(DATASET_VER)\n", + "\n", + "if DATASET_VER == 1:\n", + " MODEL_CONFIG = \"matchboxnet_3x1x64_v1.yaml\"\n", + "else:\n", + " MODEL_CONFIG = \"matchboxnet_3x1x64_v2.yaml\"\n", + "\n", + "if not os.path.exists(f\"configs/{MODEL_CONFIG}\"):\n", + " !wget -P configs/ \"https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/asr/conf/{MODEL_CONFIG}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tvfwv9Hjf1Uv" + }, + "source": [ + "# Data Preparation\n", + "\n", + "We will be using the open-source Google Speech Commands Dataset (we will use V1 of the dataset for the tutorial but require minor changes to support the V2 dataset). These scripts below will download the dataset and convert it to a format suitable for use with NeMo." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6VL10OXTf8ts" + }, + "source": [ + "## Download the dataset\n", + "\n", + "The dataset must be prepared using the scripts provided under the `{NeMo root directory}/scripts` sub-directory. \n", + "\n", + "Run the following command below to download the data preparation script and execute it.\n", + "\n", + "**NOTE**: You should have at least 4GB of disk space available if you’ve used --data_version=1; and at least 6GB if you used --data_version=2. Also, it will take some time to download and process, so go grab a coffee.\n", + "\n", + "**NOTE**: You may additionally pass a `--rebalance` flag at the end of the `process_speech_commands_data.py` script to rebalance the class samples in the manifest." 
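As a quick, optional pre-flight check for the disk-space note above (an illustrative addition, not a cell from the notebook), the free space can be inspected with the standard library before starting the download:

```python
import os
import shutil

# Rough free-space check for the ~4 GB (v1) / ~6 GB (v2) requirement noted above.
required_gb = 4 if DATASET_VER == 1 else 6
probe_path = data_dir if os.path.exists(data_dir) else "."
free_gb = shutil.disk_usage(probe_path).free / 1e9
print(f"Free space at {probe_path}: {free_gb:.1f} GB (roughly {required_gb} GB needed)")
```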
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oqKe6_uLfzKU" + }, + "outputs": [], + "source": [ + "if not os.path.exists(\"process_speech_commands_data.py\"):\n", + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/scripts/process_speech_commands_data.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TTsxp0nZ1zqo" + }, + "source": [ + "### Preparing the manifest file\n", + "\n", + "The manifest file is a simple file that has the full path to the audio file, the duration of the audio file, and the label that is assigned to that audio file. \n", + "\n", + "This notebook is only a demonstration, and therefore we will use the `--skip_duration` flag to speed up construction of the manifest file.\n", + "\n", + "**NOTE: When replicating the results of the paper, do not use this flag and prepare the manifest file with correct durations.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cWUtDpzKgop9" + }, + "outputs": [], + "source": [ + "!mkdir {data_dir}\n", + "!python process_speech_commands_data.py --data_root={data_dir} --data_version={DATASET_VER} --skip_duration --log\n", + "print(\"Dataset ready !\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eVsPFxJtg30p" + }, + "source": [ + "## Prepare the path to manifest files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ytTFGVe0g9wk" + }, + "outputs": [], + "source": [ + "dataset_path = 'google_speech_recognition_v{0}'.format(DATASET_VER)\n", + "dataset_basedir = os.path.join(data_dir, dataset_path)\n", + "\n", + "train_dataset = os.path.join(dataset_basedir, 'train_manifest.json')\n", + "val_dataset = os.path.join(dataset_basedir, 'validation_manifest.json')\n", + "test_dataset = os.path.join(dataset_basedir, 'validation_manifest.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s0SZy9SEhOBf" + }, + "source": [ + "## Read a few rows of the manifest file \n", + "\n", + "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", + "\n", + "1) `audio_filepath`: Refers to the path to the raw audio file
\n", + "2) `command`: The class label (or speech command) of this sample
\n", + "3) `duration`: The length of the audio file, in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HYBidCMIhKQV" + }, + "outputs": [], + "source": [ + "!head -n 5 {train_dataset}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "r-pyUBedh8f4" + }, + "source": [ + "# Training - Preparation\n", + "\n", + "We will be training a MatchboxNet model from the paper [\"MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition\"](https://arxiv.org/abs/2004.08531). The benefit of MatchboxNet over JASPER models is that they use 1D Time-Channel Separable Convolutions, which greatly reduce the number of parameters required to obtain good model accuracy.\n", + "\n", + "MatchboxNet models generally follow the model definition pattern QuartzNet-[BxRXC], where B is the number of blocks, R is the number of convolutional sub-blocks, and C is the number of channels in these blocks. Each sub-block contains a 1-D masked convolution, batch normalization, ReLU, and dropout.\n", + "\n", + "An image of QuartzNet, the base configuration of MatchboxNet models, is provided below.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "T0sV4riijHJF" + }, + "source": [ + "
(figure: QuartzNet architecture diagram)
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ieAPOM9thTN2" + }, + "outputs": [], + "source": [ + "# NeMo's \"core\" package\n", + "import nemo\n", + "# NeMo's ASR collection - this collections contains complete ASR models and\n", + "# building blocks (modules) for ASR\n", + "import nemo.collections.asr as nemo_asr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ss9gLcDv30jI" + }, + "source": [ + "## Model Configuration\n", + "The MatchboxNet Model is defined in a config file which declares multiple important sections.\n", + "\n", + "They are:\n", + "\n", + "1) `model`: All arguments that will relate to the Model - preprocessors, encoder, decoder, optimizer and schedulers, datasets and any other related information\n", + "\n", + "2) `trainer`: Any argument to be passed to PyTorch Lightning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yoVAs9h1lfci" + }, + "outputs": [], + "source": [ + "# This line will print the entire config of the MatchboxNet model\n", + "config_path = f\"configs/{MODEL_CONFIG}\"\n", + "config = OmegaConf.load(config_path)\n", + "print(config.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "m2lJPR0a3qww" + }, + "outputs": [], + "source": [ + "# Preserve some useful parameters\n", + "labels = config.model.labels\n", + "sample_rate = config.sample_rate" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8_pmjeed78rJ" + }, + "source": [ + "### Setting up the datasets within the config\n", + "\n", + "If you'll notice, there are a few config dictionaries called `train_ds`, `validation_ds` and `test_ds`. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DIe6Qfs18MiQ" + }, + "outputs": [], + "source": [ + "print(config.model.train_ds.pretty())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Fb01hl868Uc3" + }, + "source": [ + "### `???` inside configs\n", + "\n", + "You will often notice that some configs have `???` in place of paths. This is used as a placeholder so that the user can change the value at a later time.\n", + "\n", + "Let's add the paths to the manifests to the config above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "m181HXev8T97" + }, + "outputs": [], + "source": [ + "config.model.train_ds.manifest_filepath = train_dataset\n", + "config.model.validation_ds.manifest_filepath = val_dataset\n", + "config.model.test_ds.manifest_filepath = test_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pbXngoCM5IRG" + }, + "source": [ + "## Building the PyTorch Lightning Trainer\n", + "\n", + "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem!\n", + "\n", + "Lets first instantiate a Trainer object!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bYtvdBlG5afU" + }, + "outputs": [], + "source": [ + "import torch\n", + "import pytorch_lightning as pl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jRN18CdH51nN" + }, + "outputs": [], + "source": [ + "print(\"Trainer config - \\n\")\n", + "print(config.trainer.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gHf6cHvm6H9b" + }, + "outputs": [], + "source": [ + "# Lets modify some trainer configs for this demo\n", + "# Checks if we have GPU available and uses it\n", + "cuda = 1 if torch.cuda.is_available() else 0\n", + "config.trainer.gpus = cuda\n", + "\n", + "# Reduces maximum number of epochs to 5 for quick demonstration\n", + "config.trainer.max_epochs = 5\n", + "\n", + "# Remove distributed training flags\n", + "config.trainer.distributed_backend = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "UB9nr7G56G3L" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(**config.trainer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2wt603Vq6sqX" + }, + "source": [ + "## Setting up a NeMo Experiment\n", + "\n", + "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it ! " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TfWJFg7p6Ezf" + }, + "outputs": [], + "source": [ + "from nemo.utils.exp_manager import exp_manager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SC-QPoW44-p2" + }, + "outputs": [], + "source": [ + "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Yqi6rkNR7Dph" + }, + "outputs": [], + "source": [ + "# The exp_dir provides a path to the current experiment for easy access\n", + "exp_dir = str(exp_dir)\n", + "exp_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "t0zz-vHH7Uuh" + }, + "source": [ + "## Building the MatchboxNet Model\n", + "\n", + "MatchboxNet is an ASR model with a classification task - it generates one label for the entire provided audio stream. Therefore we encapsulate it inside the `EncDecClassificationModel` as follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FRMrKhyf5vhy" + }, + "outputs": [], + "source": [ + "asr_model = nemo_asr.models.EncDecClassificationModel(cfg=config.model, trainer=trainer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jA9UND-Q_oyw" + }, + "source": [ + "# Training a MatchboxNet Model\n", + "\n", + "As MatchboxNet is inherently a PyTorch Lightning Model, it can easily be trained in a single line - `trainer.fit(model)` !" 
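As a quick sanity check of that claim (an illustrative aside, not a tutorial cell), the model really is a `LightningModule`, and its trainable-parameter count confirms the compact "Matchbox"-sized footprint:

```python
import pytorch_lightning as pl

# NeMo models subclass LightningModule, which is why trainer.fit(asr_model) is enough.
assert isinstance(asr_model, pl.LightningModule)

num_params = sum(p.numel() for p in asr_model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params / 1e6:.2f} M")
```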
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3ngKcRFqBfIF" + }, + "source": [ + "### Monitoring training progress\n", + "\n", + "Before we begin training, lets first create a Tensorboard visualization to monitor progress\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cyfec0PDBsXa" + }, + "outputs": [], + "source": [ + "# Load the TensorBoard notebook extension\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4L5ymu-QBxmz" + }, + "outputs": [], + "source": [ + "%tensorboard --logdir {exp_dir}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZApuELDIKQgC" + }, + "source": [ + "### Training for 5 epochs\n", + "We see below that the model begins to get modest scores on the validation set after just 5 epochs of training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9xiUUJlH5KdD" + }, + "outputs": [], + "source": [ + "trainer.fit(asr_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Dkds1jSvKgSc" + }, + "source": [ + "### Evaluation on the Test set\n", + "\n", + "Lets compute the final score on the test set via `trainer.test(model)`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mULTrhEJ_6wV" + }, + "outputs": [], + "source": [ + "trainer.test(asr_model, ckpt_path=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XQntce8cLiUC" + }, + "source": [ + "# Fast Training\n", + "\n", + "We can dramatically improve the time taken to train this model by using Multi GPU training along with Mixed Precision.\n", + "\n", + "For multi-GPU training, take a look at [the PyTorch Lightning Multi-GPU training section](https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html)\n", + "\n", + "For mixed-precision training, take a look at [the PyTorch Lightning Mixed-Precision training section](https://pytorch-lightning.readthedocs.io/en/latest/apex.html)\n", + "\n", + "```python\n", + "# Mixed precision:\n", + "trainer = Trainer(amp_level='O1', precision=16)\n", + "\n", + "# Trainer with a distributed backend:\n", + "trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", + "\n", + "# Of course, you can combine these flags as well.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ifDHkunjM8y6" + }, + "source": [ + "# Evaluation of incorrectly predicted samples\n", + "\n", + "Given that we have a trained model, which performs reasonably well, let's try to listen to the samples where the model is least confident in its predictions.\n", + "\n", + "For this, we need the support of the librosa library.\n", + "\n", + "**NOTE**: The following code depends on librosa. To install it, run the following code block first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "s3w3LhHcKuD2" + }, + "outputs": [], + "source": [ + "!pip install librosa" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PcJrZ72sNCkM" + }, + "source": [ + "## Extract the predictions from the model\n", + "\n", + "We want to possess the actual logits of the model instead of just the final evaluation score, so we can define a function to perform the forward step for us without computing the final loss. Instead, we extract the logits per batch of samples provided." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rvxdviYtOFjK" + }, + "source": [ + "## Accessing the data loaders\n", + "\n", + "We can utilize the `setup_test_data` method in order to instantiate a data loader for the dataset we want to analyze.\n", + "\n", + "For convenience, we can access these instantiated data loaders using the following accessors - `asr_model._train_dl`, `asr_model._validation_dl` and `asr_model._test_dl`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CB0QZCAmM656" + }, + "outputs": [], + "source": [ + "asr_model.setup_test_data(config.model.test_ds)\n", + "test_dl = asr_model._test_dl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rA7gXawcPoip" + }, + "source": [ + "## Partial Test Step\n", + "\n", + "Below we define a utility function to perform most of the test step. For reference, the test step is defined as follows:\n", + "\n", + "```python\n", + " def test_step(self, batch, batch_idx, dataloader_idx=0):\n", + " audio_signal, audio_signal_len, labels, labels_len = batch\n", + " logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", + " loss_value = self.loss(logits=logits, labels=labels)\n", + " correct_counts, total_counts = self._accuracy(logits=logits, labels=labels)\n", + " return {'test_loss': loss_value, 'test_correct_counts': correct_counts, 'test_total_counts': total_counts}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sBsDOm5ROpQI" + }, + "outputs": [], + "source": [ + "@torch.no_grad()\n", + "def extract_logits(model, dataloader):\n", + " logits_buffer = []\n", + " label_buffer = []\n", + "\n", + " # Follow the above definition of the test_step\n", + " for batch in dataloader:\n", + " audio_signal, audio_signal_len, labels, labels_len = batch\n", + " logits = model(input_signal=audio_signal, input_signal_length=audio_signal_len)\n", + "\n", + " logits_buffer.append(logits)\n", + " label_buffer.append(labels)\n", + " print(\".\", end='')\n", + " print()\n", + " \n", + " print(\"Finished extracting logits !\")\n", + " logits = torch.cat(logits_buffer, 0)\n", + " labels = torch.cat(label_buffer, 0)\n", + " return logits, labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mZSdprUlOuoV" + }, + "outputs": [], + "source": [ + "cpu_model = asr_model.cpu()\n", + "cpu_model.eval()\n", + "logits, labels = extract_logits(cpu_model, test_dl)\n", + "print(\"Logits:\", logits.shape, \"Labels :\", labels.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9Wd0ukgNXRBz" + }, + "outputs": 
[], + "source": [ + "# Compute accuracy - `_accuracy` is a PyTorch Lightning Metric !\n", + "correct_count, total_count = cpu_model._accuracy(logits=logits, labels=labels)\n", + "print(\"Accuracy : \", float(correct_count * 100.) / float(total_count))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NwN9OSqCauSH" + }, + "source": [ + "## Filtering out incorrect samples\n", + "Let us now filter out the incorrectly labeled samples from the total set of samples in the test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "N1YJvsmcZ0uE" + }, + "outputs": [], + "source": [ + "import librosa\n", + "import json\n", + "import IPython.display as ipd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jZAT9yGAayvR" + }, + "outputs": [], + "source": [ + "# First lets create a utility class to remap the integer class labels to actual string label\n", + "class ReverseMapLabel:\n", + " def __init__(self, data_loader):\n", + " self.label2id = dict(data_loader.dataset.label2id)\n", + " self.id2label = dict(data_loader.dataset.id2label)\n", + "\n", + " def __call__(self, pred_idx, label_idx):\n", + " return self.id2label[pred_idx], self.id2label[label_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "X3GSXvYHa4KJ" + }, + "outputs": [], + "source": [ + "# Next, lets get the indices of all the incorrectly labeled samples\n", + "sample_idx = 0\n", + "incorrect_preds = []\n", + "rev_map = ReverseMapLabel(test_dl)\n", + "\n", + "# Remember, evaluated_tensor = (loss, logits, labels)\n", + "probs = torch.softmax(logits, dim=-1)\n", + "probas, preds = torch.max(probs, dim=-1)\n", + "\n", + "incorrect_ids = (preds != labels).nonzero()\n", + "for idx in incorrect_ids:\n", + " proba = float(probas[idx][0])\n", + " pred = int(preds[idx][0])\n", + " label = int(labels[idx][0])\n", + " idx = int(idx[0]) + sample_idx\n", + "\n", + " incorrect_preds.append((idx, *rev_map(pred, label), proba))\n", + "\n", + "print(f\"Num test samples : {total_count.item()}\")\n", + "print(f\"Num errors : {len(incorrect_preds)}\")\n", + "\n", + "# First lets sort by confidence of prediction\n", + "incorrect_preds = sorted(incorrect_preds, key=lambda x: x[-1], reverse=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0JgGo71gcDtD" + }, + "source": [ + "## Examine a subset of incorrect samples\n", + "Let's print out the (test id, predicted label, ground truth label, confidence) tuple of first 20 incorrectly labeled samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "x37wNJsNbcw0" + }, + "outputs": [], + "source": [ + "for incorrect_sample in incorrect_preds[:20]:\n", + " print(str(incorrect_sample))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tDnwYsDKcLv9" + }, + "source": [ + "## Define a threshold below which we designate a model's prediction as \"low confidence\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dpvzeh4PcGJs" + }, + "outputs": [], + "source": [ + "# Filter out how many such samples exist\n", + "low_confidence_threshold = 0.25\n", + "count_low_confidence = len(list(filter(lambda x: x[-1] <= 
low_confidence_threshold, incorrect_preds)))\n", + "print(f\"Number of low confidence predictions : {count_low_confidence}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ERXyXvCAcSKR" + }, + "source": [ + "## Lets hear the samples which the model has least confidence in !" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kxjNVjX8cPNP" + }, + "outputs": [], + "source": [ + "# First lets create a helper function to parse the manifest files\n", + "def parse_manifest(manifest):\n", + " data = []\n", + " for line in manifest:\n", + " line = json.loads(line)\n", + " data.append(line)\n", + "\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IWxqw5k-cUVd" + }, + "outputs": [], + "source": [ + "# Next, lets create a helper function to actually listen to certain samples\n", + "def listen_to_file(sample_id, pred=None, label=None, proba=None):\n", + " # Load the audio waveform using librosa\n", + " filepath = test_samples[sample_id]['audio_filepath']\n", + " audio, sample_rate = librosa.load(filepath)\n", + "\n", + " if pred is not None and label is not None and proba is not None:\n", + " print(f\"Sample : {sample_id} Prediction : {pred} Label : {label} Confidence = {proba: 0.4f}\")\n", + " else:\n", + " print(f\"Sample : {sample_id}\")\n", + "\n", + " return ipd.Audio(audio, rate=sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HPj1tFNIcXaU" + }, + "outputs": [], + "source": [ + "# Now lets load the test manifest into memory\n", + "test_samples = []\n", + "with open(test_dataset, 'r') as test_f:\n", + " test_samples = test_f.readlines()\n", + "\n", + "test_samples = parse_manifest(test_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Nt7b_uiScZcC" + }, + "outputs": [], + "source": [ + "# Finally, lets listen to all the audio samples where the model made a mistake\n", + "# Note: This list of incorrect samples may be quite large, so you may choose to subsample `incorrect_preds`\n", + "count = min(count_low_confidence, 20) # replace this line with just `count_low_confidence` to listen to all samples with low confidence\n", + "\n", + "for sample_id, pred, label, proba in incorrect_preds[:count]:\n", + " ipd.display(listen_to_file(sample_id, pred=pred, label=label, proba=proba))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gxLGGDvHW2kV" + }, + "source": [ + "# Fine-tuning on a new dataset\n", + "\n", + "We currently trained our dataset on all 30/35 classes of the Google Speech Commands dataset (v1/v2).\n", + "\n", + "We will now show an example of fine-tuning a trained model on a subset of the classes, as a demonstration of fine-tuning.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mZAPGTzeXnuQ" + }, + "source": [ + "## Preparing the data-subsets\n", + "\n", + "Lets select 2 of the classes, `yes` and `no` and prepare our manifests with this dataset." 
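Before extracting the subset, it can help to see how many `yes`/`no` examples the full training manifest contains. This is an illustrative aside that simply reuses the manifest format described earlier:

```python
import json
from collections import Counter

# Count examples per command in the full training manifest.
with open(train_dataset, 'r') as f:
    class_counts = Counter(json.loads(line)['command'] for line in f)

print("yes :", class_counts.get("yes", 0))
print("no  :", class_counts.get("no", 0))
```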
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "G1RI4GBNfjUW" + }, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "L3cFvN5vcbjb" + }, + "outputs": [], + "source": [ + "def extract_subset_from_manifest(name: str, manifest_path: str, labels: list):\n", + " manifest_dir = os.path.split(manifest_path)[0]\n", + " labels = set(labels)\n", + " manifest_values = []\n", + "\n", + " print(f\"Parsing manifest: {manifest_path}\")\n", + " with open(manifest_path, 'r') as f:\n", + " for line in f:\n", + " val = json.loads(line)\n", + "\n", + " if val['command'] in labels:\n", + " manifest_values.append(val)\n", + "\n", + " print(f\"Number of files extracted from dataset: {len(manifest_values)}\")\n", + "\n", + " outpath = os.path.join(manifest_dir, name)\n", + " with open(outpath, 'w') as f:\n", + " for val in manifest_values:\n", + " json.dump(val, f)\n", + " f.write(\"\\n\")\n", + " f.flush()\n", + "\n", + " print(\"Manifest subset written to path :\", outpath)\n", + " print()\n", + "\n", + " return outpath" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fXQ0N1evfqZ8" + }, + "outputs": [], + "source": [ + "labels = [\"yes\", \"no\"]\n", + "\n", + "train_subdataset = extract_subset_from_manifest(\"train_subset.json\", train_dataset, labels)\n", + "val_subdataset = extract_subset_from_manifest(\"val_subset.json\", val_dataset, labels)\n", + "test_subdataset = extract_subset_from_manifest(\"test_subset.json\", test_dataset, labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IO5pVNyKimiE" + }, + "source": [ + "## Saving/Restoring a checkpoint\n", + "\n", + "There are multiple ways to save and load models in NeMo. Since all NeMo models are inherently Lightning Modules, we can use the standard way that PyTorch Lightning saves and restores models.\n", + "\n", + "NeMo also provides a more advanced model save/restore format, which encapsulates all the parts of the model that are required to restore that model for immediate use.\n", + "\n", + "In this example, we will explore both ways of saving and restoring models, but we will focus on the PyTorch Lightning method." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "lMKvrT88jZwC" + }, + "source": [ + "### Saving and Restoring via PyTorch Lightning Checkpoints\n", + "\n", + "When using NeMo for training, it is advisable to utilize the `exp_manager` framework. It is tasked with handling checkpointing and logging (Tensorboard as well as WandB optionally!), as well as dealing with multi-node and multi-GPU logging.\n", + "\n", + "Since we utilized the `exp_manager` framework above, we have access to the directory where the checkpoints exist. \n", + "\n", + "`exp_manager` with the default settings will save multiple checkpoints for us - \n", + "\n", + "1) A few checkpoints from certain steps of training. They will have `--val_loss=` tags\n", + "\n", + "2) A checkpoint at the last epoch of training denotes by `--last`.\n", + "\n", + "3) If the model finishes training, it will also have a `--end` checkpoint." 
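Besides the `--end` checkpoint used below, one could also pick the checkpoint with the best validation loss. The sketch below is illustrative only and assumes the default `exp_manager` filenames embed a numeric `--val_loss=<value>` tag, as described above:

```python
import glob
import os
import re

def best_val_loss_checkpoint(checkpoint_dir):
    """Return the checkpoint path with the lowest parsed --val_loss value, if any."""
    scored = []
    for path in glob.glob(os.path.join(checkpoint_dir, "*.ckpt")):
        match = re.search(r"--val_loss=([0-9]+(?:\.[0-9]+)?)", path)
        if match:
            scored.append((float(match.group(1)), path))
    return min(scored)[1] if scored else None

print(best_val_loss_checkpoint(os.path.join(exp_dir, "checkpoints")))
```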
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TcHTw5ErmQRi" + }, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5h8zMJHngUrV" + }, + "outputs": [], + "source": [ + "print(exp_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "F9K_Ct_hl8oU" + }, + "outputs": [], + "source": [ + "# Lets list all the checkpoints we have\n", + "checkpoint_dir = os.path.join(exp_dir, 'checkpoints')\n", + "checkpoint_paths = list(glob.glob(os.path.join(checkpoint_dir, \"*.ckpt\")))\n", + "checkpoint_paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "67fbB61umfb4" + }, + "outputs": [], + "source": [ + "# We want the checkpoint saved after the final step of training\n", + "final_checkpoint = list(filter(lambda x: \"--end.ckpt\" in x, checkpoint_paths))[0]\n", + "print(final_checkpoint)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZADUzv02nknZ" + }, + "source": [ + "### Restoring from a PyTorch Lightning checkpoint\n", + "\n", + "To restore a model using the `LightningModule.load_from_checkpoint()` class method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ywd9Qj4Xm3VC" + }, + "outputs": [], + "source": [ + "restored_model = nemo_asr.models.EncDecClassificationModel.load_from_checkpoint(final_checkpoint)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0f4GQa8vB1BB" + }, + "source": [ + "## Prepare the model for fine-tuning\n", + "\n", + "Remember, the original model was trained for a 30/35 way classification task. Now we require only a subset of these models, so we need to modify the decoder head to support fewer classes.\n", + "\n", + "We can do this easily with the convenient function `EncDecClassificationModel.change_labels(new_label_list)`.\n", + "\n", + "By performing this step, we discard the old decoder head, but still, preserve the encoder!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iMCMds7pB16U" + }, + "outputs": [], + "source": [ + "restored_model.change_labels(labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rrspQ2QFtbCK" + }, + "source": [ + "### Prepare the data loaders\n", + "\n", + "The restored model, upon restoration, will not attempt to set up any data loaders. \n", + "\n", + "This is so that we can manually set up any datasets we want - train and val to finetune the model, test in order to just evaluate, or all three to do both!\n", + "\n", + "The entire config that we used before can still be accessed via `ModelPT._cfg`, so we will use it in order to set up our data loaders. This also gives us the opportunity to set any additional parameters we wish to setup!" 
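For example (a sketch only; it assumes the copied dataset configs created in the next cells expose `batch_size` and `num_workers` keys), additional loader parameters could be overridden before the `setup_*_data` calls:

```python
# Hypothetical overrides for the dataset configs created in the following cells,
# applied before they are passed to setup_training_data() and friends.
def adjust_loader_cfg(ds_cfg, batch_size=64, num_workers=2):
    ds_cfg.batch_size = batch_size    # assumed key in the dataset config
    ds_cfg.num_workers = num_workers  # assumed key; lower it on small machines
    return ds_cfg

# Example usage once train_subdataset_cfg exists (see the next cells):
# adjust_loader_cfg(train_subdataset_cfg)
```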
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "9JxhiZN5ulUl" + }, + "outputs": [], + "source": [ + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qzHfTOkPowJo" + }, + "outputs": [], + "source": [ + "train_subdataset_cfg = copy.deepcopy(restored_model._cfg.train_ds)\n", + "val_subdataset_cfg = copy.deepcopy(restored_model._cfg.validation_ds)\n", + "test_subdataset_cfg = copy.deepcopy(restored_model._cfg.test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "it9-vFX6vHUl" + }, + "outputs": [], + "source": [ + "# Set the paths to the subset of the dataset\n", + "train_subdataset_cfg.manifest_filepath = train_subdataset\n", + "val_subdataset_cfg.manifest_filepath = val_subdataset\n", + "test_subdataset_cfg.manifest_filepath = test_subdataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1qzWY8QDvgfc" + }, + "outputs": [], + "source": [ + "# Setup the data loader for the restored model\n", + "restored_model.setup_training_data(train_subdataset_cfg)\n", + "restored_model.setup_multiple_validation_data(val_subdataset_cfg)\n", + "restored_model.setup_multiple_test_data(test_subdataset_cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "y8GZ5a5rC0gY" + }, + "outputs": [], + "source": [ + "# Check data loaders are correct\n", + "print(\"Train dataset labels :\", restored_model._train_dl.dataset.labels)\n", + "print(\"Val dataset labels :\", restored_model._validation_dl.dataset.labels)\n", + "print(\"Test dataset labels :\", restored_model._test_dl.dataset.labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "76yDcWZ9zl2G" + }, + "source": [ + "## Setting up a new Trainer and Experiment Manager\n", + "\n", + "A restored model has a utility method to attach the Trainer object to it, which is necessary in order to correctly set up the optimizer and scheduler!\n", + "\n", + "**Note**: The restored model does not contain the trainer config with it. It is necessary to create a new Trainer object suitable for the environment where the model is being trained. The template can be replicated from any of the training scripts.\n", + "\n", + "Here, since we already had the previous config object that prepared the trainer, we could have used it, but for demonstration, we will set up the trainer config manually." 
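To make the alternative mentioned above concrete, here is a sketch that reuses the trainer section of the original config instead of writing a new one; the next cell follows the manual route, as the tutorial intends:

```python
# Reuse and adjust the original trainer config (sketch only).
trainer_cfg_reused = copy.deepcopy(config.trainer)
trainer_cfg_reused.gpus = 1 if torch.cuda.is_available() else 0
trainer_cfg_reused.max_epochs = 5

# trainer_finetune = pl.Trainer(**trainer_cfg_reused)  # equivalent to the manual config below
```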
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "swTe3WvBzkBJ" + }, + "outputs": [], + "source": [ + "# Setup the new trainer object\n", + "# Lets modify some trainer configs for this demo\n", + "# Checks if we have GPU available and uses it\n", + "cuda = 1 if torch.cuda.is_available() else 0\n", + "\n", + "trainer_config = OmegaConf.create(dict(\n", + " gpus=cuda,\n", + " max_epochs=5,\n", + " max_steps=None, # computed at runtime if not set\n", + " num_nodes=1,\n", + " accumulate_grad_batches=1,\n", + " checkpoint_callback=False, # Provided by exp_manager\n", + " logger=False, # Provided by exp_manager\n", + " row_log_interval=1, # Interval of logging.\n", + " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", + "))\n", + "print(trainer_config.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Nd_ej4bI3TIy" + }, + "outputs": [], + "source": [ + "trainer_finetune = pl.Trainer(**trainer_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WtGu5q5T32XA" + }, + "source": [ + "### Setting the trainer to the restored model\n", + "\n", + "All NeMo models provide a convenience method `set_trainer()` in order to setup the trainer after restoration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "BTozhedA3zpM" + }, + "outputs": [], + "source": [ + "restored_model.set_trainer(trainer_finetune)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "XojTpEiI3TQa" + }, + "outputs": [], + "source": [ + "exp_dir_finetune = exp_manager(trainer_finetune, config.get(\"exp_manager\", None))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "x_LSbmCQ3TUf" + }, + "outputs": [], + "source": [ + "exp_dir_finetune = str(exp_dir_finetune)\n", + "exp_dir_finetune" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QT_mWWnSxPLv" + }, + "source": [ + "## Setup optimizer + scheduler\n", + "\n", + "For a fine-tuning experiment, lets set up the optimizer and scheduler!\n", + "\n", + "We will use a much lower learning rate than before, and also swap out the scheduler from PolyHoldDecay to CosineDecay." 
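For intuition about the swap, the snippet below evaluates the standard cosine-annealing curve at a few points. It is illustrative only; NeMo's `CosineAnnealing` scheduler adds warmup handling on top of this basic decay:

```python
import math

# Standard cosine annealing from lr down to min_lr over max_steps (ignoring warmup).
lr, min_lr, max_steps = 0.001, 1e-4, 100
for step in (0, 25, 50, 75, 100):
    current = min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * step / max_steps))
    print(f"step {step:3d}: lr = {current:.6f}")
```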
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TugHsePsxA5Q" + }, + "outputs": [], + "source": [ + "optim_sched_cfg = copy.deepcopy(restored_model._cfg.optim)\n", + "# Struct mode prevents us from popping off elements from the config, so lets disable it\n", + "OmegaConf.set_struct(optim_sched_cfg, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pZSo0sWPxwiG" + }, + "outputs": [], + "source": [ + "# Lets change the maximum learning rate to previous minimum learning rate\n", + "optim_sched_cfg.lr = 0.001\n", + "\n", + "# Lets change the scheduler\n", + "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n", + "\n", + "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n", + "optim_sched_cfg.sched.pop('power')\n", + "\n", + "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n", + "optim_sched_cfg.sched.pop('hold_ratio')\n", + "\n", + "# Set \"min_lr\" to lower value\n", + "optim_sched_cfg.sched.min_lr = 1e-4\n", + "\n", + "print(optim_sched_cfg.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FqqyFF3Ey5If" + }, + "outputs": [], + "source": [ + "# Now lets update the optimizer settings\n", + "restored_model.setup_optimization(optim_sched_cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mdivgIPUzgP_" + }, + "outputs": [], + "source": [ + "# We can also just directly replace the config inplace if we choose to\n", + "restored_model._cfg.optim = optim_sched_cfg" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3-lRyz2_Eyrl" + }, + "source": [ + "## Fine-tune training step\n", + "\n", + "We fine-tune on the subset classification problem. Note, the model was originally trained on these classes (the subset defined here has already been trained on above).\n", + "\n", + "When fine-tuning on a truly new dataset, we will not see such a dramatic improvement in performance. However, it should still converge a little faster than if it was trained from scratch." 
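One technique that often helps in the truly-new-dataset case, shown here as an optional sketch (it is not used by this tutorial and assumes the model exposes its encoder as `.encoder`), is to freeze the pretrained encoder and update only the new decoder head:

```python
# Freeze the pretrained encoder so only the freshly initialised decoder head is trained.
def freeze_encoder(model):
    for p in model.encoder.parameters():  # assumes an `encoder` submodule
        p.requires_grad = False
    model.encoder.eval()

# freeze_encoder(restored_model)  # optional; not applied in this tutorial
```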
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nq-iHIgx6OId" + }, + "source": [ + "### Monitor training progress via Tensorboard\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PIacDWcD5vCR" + }, + "outputs": [], + "source": [ + "%tensorboard --logdir {exp_dir_finetune}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "r5_z1eW76fip" + }, + "source": [ + "### Fine-tuning for 5 epochs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WH8rN6dA6V9S" + }, + "outputs": [], + "source": [ + "trainer_finetune.fit(restored_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "lgV0s8auJpxV" + }, + "source": [ + "### Evaluation on the Test set\n", + "\n", + "Lets compute the final score on the test set via `trainer.test(model)`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "szpLp6XTDPaK" + }, + "outputs": [], + "source": [ + "trainer_finetune.test(restored_model, ckpt_path=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uNBAaf1FKcAZ" + }, + "source": [ + "## Advanced Usage: Exporting a model in its entirety\n", + "\n", + "While most models can be easily serialized via the Experiment Manager as a PyTorch Lightning checkpoint, there are certain models where this is insufficient. \n", + "\n", + "Consider the case where a Model contains artifacts such as tokenizers or other intermediate file objects that cannot be so easily serialized into a checkpoint.\n", + "\n", + "For such cases, NeMo offers two utility functions that enable serialization of a Model + artifacts - `save_to` and `restore_from`.\n", + "\n", + "Further documentation regarding these methods can be obtained from the documentation pages on NeMo." 
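Once the cells below have written `model.nemo` and produced `restored_model_2`, a simple round-trip check (illustrative only) is to compare the weights of the two models:

```python
import torch

# Compare every tensor in the two state dicts; True means the .nemo round-trip is lossless.
def weights_match(model_a, model_b):
    sd_a, sd_b = model_a.state_dict(), model_b.state_dict()
    return sd_a.keys() == sd_b.keys() and all(torch.equal(sd_a[k], sd_b[k]) for k in sd_a)

# print(weights_match(restored_model, restored_model_2))  # run after the cells below
```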
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Dov9g2j8Lyjs" + }, + "outputs": [], + "source": [ + "import tarfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WNixPPFNJyNc" + }, + "outputs": [], + "source": [ + "# Save a model as a tarfile\n", + "restored_model.save_to(os.path.join(exp_dir_finetune, \"model.nemo\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "B2RHYNjjLrcW" + }, + "outputs": [], + "source": [ + "# The above object is just a tarfile which can store additional artifacts.\n", + "with tarfile.open(os.path.join(exp_dir_finetune, 'model.nemo')) as blob:\n", + " for item in blob:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fRo04x3TLxdu" + }, + "outputs": [], + "source": [ + "# Restore a model from a tarfile\n", + "restored_model_2 = nemo_asr.models.EncDecClassificationModel.restore_from(os.path.join(exp_dir_finetune, \"model.nemo\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LyIegk2CPNsI" + }, + "source": [ + "## Conclusion\n", + "Once the model has been restored, either via a PyTorch Lightning checkpoint or via the `restore_from` methods, one can finetune by following the above general steps." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "02_Speech_Commands.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tutorials/asr/05_Online_Noise_Augmentation.ipynb b/tutorials/asr/05_Online_Noise_Augmentation.ipynb index af5df7bd1b57..b34acacb5fcc 100644 --- a/tutorials/asr/05_Online_Noise_Augmentation.ipynb +++ b/tutorials/asr/05_Online_Noise_Augmentation.ipynb @@ -1,1495 +1,1235 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "name": "05_Online_Noise_Augmentation.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "cvXwyS263AMk", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "57646cdd-58c1-4ddb-8805-9178cb0a2048" - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell.\n", - "\n", - "## Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg\n", - "!pip install unidecode\n", - "\n", - "# ## Install NeMo\n", - "!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@candidate#egg=nemo_toolkit[asr]\n", - "\n", - "## Install TorchAudio\n", - "!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - "\n", - "## Grab the config we'll use in this example\n", - "!mkdir configs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Kqg4Rwki4jBX", - "colab_type": "text" - }, - "source": [ - "# Introduction\n", - "\n", - "Data augmentation is a useful method to improve the performance of models which is applicable across multiple domains. Certain augmentations can also substantially improve robustness of models to noisy samples. 
\n", - "\n", - "In this notebook, we describe how to construct an augmentation pipeline inside [Neural Modules (NeMo)](https://github.com/NVIDIA/NeMo), enable augmented training of a [MatchboxNet model](https://arxiv.org/abs/2004.08531 ) ( based on QuartzNet, from the paper [\"QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions\"](https://arxiv.org/abs/1910.10261)) and finally how to construct custom augmentations to add to NeMo.\n", - "\n", - "The notebook will follow the steps below:\n", - "\n", - " - Dataset preparation: Preparing a noise dataset using an example file.\n", - "\n", - " - Construct a data augmentation pipeline.\n", - " \n", - " - Construct a custom augmentation and register it for use in NeMo." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5XieMEo84pJ-", - "colab_type": "text" - }, - "source": [ - "## Note\n", - "Data augmentation is valuable for many datasets, but it comes at the cost of increased training time if samples are augmented during training time. Certain augmentations are particularly costly, in terms of how much time they take to process a single sample. A few examples of slow augmentations available in NeMo are : \n", - "\n", - " - Speed Perturbation\n", - " - Time Stretch Perturbation (Sample level)\n", - " - Noise Perturbation\n", - " - Impulse Perturbation\n", - " - Time Stretch Augmentation (Batch level, Neural Module)\n", - " \n", - "For such augmentations, it is advisable to pre-process the dataset offline for a one time preprocessing cost and then train the dataset on this augmented training set." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Tgc_ZHDl4sMy", - "colab_type": "text" - }, - "source": [ - "## Taking a Look at Our Data (AN4)\n", - "\n", - "The AN4 dataset, also known as the Alphanumeric dataset, was collected and published by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. We choose to use AN4 for this tutorial because it is relatively small, with 948 training and 130 test utterances, and so it trains quickly.\n", - "\n", - "Before we get started, let's download and prepare the dataset. The utterances are available as `.sph` files, so we will need to convert them to `.wav` for later processing. Please make sure you have [Sox](http://sox.sourceforge.net/) installed for this step (see the \"Downloads\" section of the main page)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DtLm_XuQ3pmk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This is where the an4/ directory will be placed.\n", - "# Change this if you don't want the data to be extracted in the current directory.\n", - "data_dir = '.'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "HjfLhUtH4wNc", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, - "outputId": "f0a9cd46-6709-49dd-9103-1e0ef61de745" - }, - "source": [ - "import glob\n", - "import os\n", - "import subprocess\n", - "import tarfile\n", - "import wget\n", - "\n", - "# Download the dataset. 
This will take a few moments...\n", - "print(\"******\")\n", - "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", - " an4_path = wget.download(an4_url, data_dir)\n", - " print(f\"Dataset downloaded at: {an4_path}\")\n", - "else:\n", - " print(\"Tarfile already exists.\")\n", - " an4_path = data_dir + '/an4_sphere.tar.gz'\n", - "\n", - "if not os.path.exists(data_dir + '/an4/'):\n", - " # Untar and convert .sph to .wav (using sox)\n", - " tar = tarfile.open(an4_path)\n", - " tar.extractall(path=data_dir)\n", - "\n", - " print(\"Converting .sph to .wav...\")\n", - " sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", - " for sph_path in sph_list:\n", - " wav_path = sph_path[:-4] + '.wav'\n", - " cmd = [\"sox\", sph_path, wav_path]\n", - " subprocess.run(cmd)\n", - "print(\"Finished conversion.\\n******\")\n" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "******\n", - "Dataset downloaded at: ./an4_sphere.tar.gz\n", - "Converting .sph to .wav...\n", - "Finished conversion.\n", - "******\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HqJmf4WB5P1x", - "colab_type": "text" - }, - "source": [ - "You should now have a folder called `an4` that contains `etc/an4_train.transcription`, `etc/an4_test.transcription`, audio files in `wav/an4_clstk` and `wav/an4test_clstk`, along with some other files we will not need.\n", - "\n", - "We now build a few manifest files which will be used later:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AmR6CH025C8E", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "outputId": "0cd776ea-078f-4ab8-8a79-eed3e1c05839" - }, - "source": [ - "# --- Building Manifest Files --- #\n", - "import json\n", - "import librosa\n", - "\n", - "# Function to build a manifest\n", - "def build_manifest(transcripts_path, manifest_path, wav_path):\n", - " with open(transcripts_path, 'r') as fin:\n", - " with open(manifest_path, 'w') as fout:\n", - " for line in fin:\n", - " # Lines look like this:\n", - " # transcript (fileID)\n", - " transcript = line[: line.find('(')-1].lower()\n", - " transcript = transcript.replace('', '').replace('', '')\n", - " transcript = transcript.strip()\n", - "\n", - " file_id = line[line.find('(')+1 : -2] # e.g. 
\"cen4-fash-b\"\n", - " audio_path = os.path.join(\n", - " data_dir, wav_path,\n", - " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", - " file_id + '.wav')\n", - "\n", - " duration = librosa.core.get_duration(filename=audio_path)\n", - "\n", - " # Write the metadata to the manifest\n", - " metadata = {\n", - " \"audio_filepath\": audio_path,\n", - " \"duration\": duration,\n", - " \"text\": transcript\n", - " }\n", - " json.dump(metadata, fout)\n", - " fout.write('\\n')\n", - " \n", - "# Building Manifests\n", - "print(\"******\")\n", - "train_transcripts = data_dir + '/an4/etc/an4_train.transcription'\n", - "train_manifest = data_dir + '/an4/train_manifest.json'\n", - "build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n", - "print(\"Training manifest created.\")\n", - "\n", - "test_transcripts = data_dir + '/an4/etc/an4_test.transcription'\n", - "test_manifest = data_dir + '/an4/test_manifest.json'\n", - "build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", - "print(\"Test manifest created.\")\n", - "print(\"******\")" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "******\n", - "Training manifest created.\n", - "Test manifest created.\n", - "******\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EQsXzh7x5zIQ", - "colab_type": "text" - }, - "source": [ - "## Prepare the path to manifest files" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "vmOa0IRC5eW4", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dataset_basedir = os.path.join(data_dir, 'an4')\n", - "\n", - "train_dataset = os.path.join(dataset_basedir, 'train_manifest.json')\n", - "test_dataset = os.path.join(dataset_basedir, 'test_manifest.json')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pz9LC3yZ6J1Q", - "colab_type": "text" - }, - "source": [ - "## Read a few rows of the manifest file \n", - "\n", - "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", - "\n", - "1) `audio_filepath`: Refers to the path to the raw audio file
\n", - "2) `text`: The text transcript of this sample
\n", - "3) `duration`: The length of the audio file, in seconds." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3OzZQiX751iz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!head -n 5 {train_dataset}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pD9bprV66Oai", - "colab_type": "text" - }, - "source": [ - "# Data Augmentation Pipeline\n", - "\n", - "Constructing a data augmentation pipeline in NeMo is as simple as composing a nested dictionary that describes two things :\n", - "\n", - "1) The probability of that augmentation occurring - using the `prob` keyword
\n", - "2) The keyword arguments required by that augmentation class\n", - "\n", - "Below, we show a few samples of these augmentations. Note, in order to distinguish between the original sample and the perturbed sample, we exaggerate the perturbation strength significantly." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "l5bc7gYO6MHG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import torch\n", - "import IPython.display as ipd" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L8Bd8s3e6TeK", - "colab_type": "text" - }, - "source": [ - "## Audio file preparation " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "g7f9riZz6Qnj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Import the data augmentation component from ASR collection\n", - "from nemo.collections.asr.parts import perturb, segment" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "wK8uwpt16d6I", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets see the available perturbations\n", - "perturb.perturbation_types" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IP1VpkOA6nE-", - "colab_type": "text" - }, - "source": [ - "### Obtain a baseline audio file" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "sj4DNMmZ6ktm", - "colab_type": "code", - "colab": {} - }, - "source": [ - "filepath = librosa.util.example_audio_file()\n", - "sample, sr = librosa.core.load(filepath)\n", - "\n", - "ipd.Audio(sample, rate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M9mZNm296tNf", - "colab_type": "text" - }, - "source": [ - "### Convert to WAV format" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QDjlgLc-6vtq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import soundfile as sf\n", - "\n", - "# lets convert this ogg file into a wave to be compatible with NeMo\n", - "if not os.path.exists('./media'):\n", - " os.makedirs('./media/')\n", - " \n", - "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", - "filepath = os.path.join('media', filename)\n", - "\n", - "sf.write(filepath, sample, samplerate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "FEkV-ikT6xgB", - "colab_type": "code", - "colab": {} - }, - "source": [ - "sample, sr = librosa.core.load(filepath)\n", - "ipd.Audio(sample, rate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gmuwEwIQ6zK3", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# NeMo has its own support class for loading wav files\n", - "def load_audio() -> segment.AudioSegment:\n", - " filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", - " filepath = os.path.join('media', filename)\n", - " sample_segment = segment.AudioSegment.from_file(filepath, target_sr=sr)\n", - " return sample_segment\n", - "\n", - "sample_segment = load_audio()\n", - "ipd.Audio(sample_segment.samples, rate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hTnf1g1y63wZ", - "colab_type": "text" - }, - "source": [ - "## White Noise Perturbation\n", - "\n", - "White Noise perturbation is performed by the following steps :
\n", - "1) Randomly sample the amplitude of the noise from a uniformly distributed range (defined in dB)
\n", - "2) Sample Gaussian noise (mean = 0, std = 1) with the same length as the audio signal
\n", - "3) Scale this Gaussian noise by the amplitude sampled in step 1 (converted from dB to a linear scale)
\n", - "4) Add this noise vector to the original sample\n", - "\n", - "Notably, the original signal should not have a \"hissing sound\" constantly present in the perturbed version." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2jaPQyUY65ij", - "colab_type": "code", - "colab": {} - }, - "source": [ - "white_noise = perturb.WhiteNoisePerturbation(min_level=-50, max_level=-30)\n", - "\n", - "# Perturb the audio file\n", - "sample_segment = load_audio()\n", - "white_noise.perturb(sample_segment)\n", - "\n", - "ipd.Audio(sample_segment.samples, rate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2dfwesJU7DhN", - "colab_type": "text" - }, - "source": [ - "## Shift Perturbation\n", - "\n", - "Shift perturbation is performed by the following steps :
\n", - "1) Randomly sample the shift factor of the signal from a uniformly distributed range (defined in milliseconds)
\n", - "2) Depending on the sign of the shift, we shift the original signal to the left or the right.
\n", - "3) The boundary locations are filled with zeros after the shift of the signal
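For readers who want the mechanics spelled out, the three steps above can be written out in a few lines of NumPy. This is a simplified, hypothetical sketch for illustration only (the function name and default shift range are invented here); the `ShiftPerturbation` class used in the next cell also handles the NeMo `AudioSegment` plumbing.

```python
import random

import numpy as np


def shift_sketch(samples: np.ndarray, sample_rate: int,
                 min_shift_ms: float = -500.0, max_shift_ms: float = 500.0) -> np.ndarray:
    """Illustrative shift perturbation: sample a shift in ms, shift the signal, zero-fill the boundary."""
    shift_ms = random.uniform(min_shift_ms, max_shift_ms)   # step 1: sample the shift factor
    shift = int(shift_ms * sample_rate / 1000.0)
    shifted = np.roll(samples, shift)                       # step 2: shift right (positive) or left (negative)
    if shift > 0:
        shifted[:shift] = 0.0                               # step 3: zero-fill the leading boundary
    elif shift < 0:
        shifted[shift:] = 0.0                               # step 3: zero-fill the trailing boundary
    return shifted


# Example: apply a random shift to one second of audio at 16 kHz.
audio = np.random.randn(16000).astype(np.float32)
shifted_audio = shift_sketch(audio, sample_rate=16000)
```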
\n", - "\n", - "Notably, the perturbed signal below skips the first 25 to 50 seconds of the original audio below, and the remainder of the time is simply silence. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2ONq8dBI7BZf", - "colab_type": "code", - "colab": {} - }, - "source": [ - "shift = perturb.ShiftPerturbation(min_shift_ms=25000.0, max_shift_ms=50000.0)\n", - "\n", - "# Perturb the audio file \n", - "sample_segment = load_audio()\n", - "shift.perturb(sample_segment)\n", - "\n", - "ipd.Audio(sample_segment.samples, rate=sr)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kywA3h4T7G_S", - "colab_type": "text" - }, - "source": [ - "## Data Dependent Perturbations\n", - "\n", - "Some perturbations require an external data source in order to perturb the original sample. Noise Perturbation is a perfect example of one such augmentation that requires an external noise source dataset in order to pertur the original data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eYm2DgGQ7KPe", - "colab_type": "text" - }, - "source": [ - "### Preparing a manifest of \"noise\" samples" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "RXZ1o85E7FLT", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets prepare a manifest file using the baseline file itself, cut into 1 second segments\n", - "\n", - "def write_manifest(filepath, data_dir='./media/', manifest_name='noise_manifest', duration_max=None, duration_stride=1.0, filter_long=False, duration_limit=10.0):\n", - " if duration_max is None:\n", - " duration_max = 1e9\n", - " \n", - " with open(os.path.join(data_dir, manifest_name + '.json'), 'w') as fout:\n", - " \n", - " try:\n", - " x, _sr = librosa.load(filepath)\n", - " duration = librosa.get_duration(x, sr=_sr)\n", - "\n", - " except Exception:\n", - " print(f\"\\n>>>>>>>>> WARNING: Librosa failed to load file {filepath}. 
Skipping this file !\\n\")\n", - " return\n", - "\n", - " if filter_long and duration > duration_limit:\n", - " print(f\"Skipping sound sample {filepath}, exceeds duration limit of {duration_limit}\")\n", - " return\n", - "\n", - " offsets = []\n", - " durations = []\n", - "\n", - " if duration > duration_max:\n", - " current_offset = 0.0\n", - "\n", - " while current_offset < duration:\n", - " difference = duration - current_offset\n", - " segment_duration = min(duration_max, difference)\n", - "\n", - " offsets.append(current_offset)\n", - " durations.append(segment_duration)\n", - "\n", - " current_offset += duration_stride\n", - "\n", - " else:\n", - " offsets.append(0.0)\n", - " durations.append(duration)\n", - "\n", - "\n", - " for duration, offset in zip(durations, offsets):\n", - " metadata = {\n", - " 'audio_filepath': filepath,\n", - " 'duration': duration,\n", - " 'label': 'noise',\n", - " 'text': '_', # for compatibility with ASRAudioText collection\n", - " 'offset': offset,\n", - " }\n", - "\n", - " json.dump(metadata, fout)\n", - " fout.write('\\n')\n", - " fout.flush()\n", - "\n", - " print(f\"Wrote {len(durations)} segments for filename {filename}\")\n", - " \n", - " print(\"Finished preparing manifest !\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "wLTT8jlP7NdU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", - "filepath = os.path.join('media', filename)\n", - "\n", - "# Write a \"noise\" manifest file\n", - "write_manifest(filepath, manifest_name='noise_1s', duration_max=1.0, duration_stride=1.0)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "izbdrSmd7PY5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets read this noise manifest file\n", - "noise_manifest_path = os.path.join('media', 'noise_1s.json')\n", - "\n", - "!head -n 5 {noise_manifest_path}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "82yq0TOV7Q_4", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets create a helper method to load the first file in the train dataset of AN4\n", - "# Load the first sample in the manifest\n", - "def load_gsc_sample() -> segment.AudioSegment:\n", - " with open(train_dataset, 'r') as f:\n", - " line = f.readline()\n", - " \n", - " line = json.loads(line)\n", - " gsc_filepath = line['audio_filepath']\n", - " sample_segment = segment.AudioSegment.from_file(gsc_filepath)\n", - " return sample_segment\n", - "\n", - "gsc_sample_segment = load_gsc_sample()\n", - "ipd.Audio(gsc_sample_segment.samples, rate=16000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zV9ypBqz7V9a", - "colab_type": "text" - }, - "source": [ - "## Noise Augmentation\n", - "\n", - "Noise perturbation is performed by the following steps :
\n", - "1) Randomly sample the amplitude scale of the noise sample from a uniformly distributed range (defined in dB)
\n", - "2) Randomly choose an audio clip from the set of noise audio samples available
\n", - "3) Compute the gain (in dB) required for the noise clip as compared to the original sample and scale the noise by this factor
\n", - "4) If the noise snippet is of shorter duration than the original audio, then randomly select an index in time from the original sample, where the noise snippet will be added
\n", - "5) If instead the noise snippet is longer than the duration of the original audio, then randomly subsegment the noise snippet and add the full snippet to the original audio
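Step (3) is the heart of the procedure, so here is a minimal sketch of the gain computation, assuming simple mean-square dB levels. The helper names are invented for illustration; the `NoisePerturbation` class used in the next cell additionally performs the clip selection and placement described in steps (2), (4) and (5).

```python
import numpy as np


def rms_db(samples: np.ndarray) -> float:
    """Signal level in dB, computed from the mean square of the samples."""
    return 10.0 * np.log10(np.mean(samples ** 2) + 1e-12)


def noise_gain_db(speech: np.ndarray, noise: np.ndarray,
                  snr_db: float, max_gain_db: float = 300.0) -> float:
    """Gain (in dB) to apply to the noise clip so the mixture sits at the requested SNR."""
    return min(rms_db(speech) - rms_db(noise) - snr_db, max_gain_db)


# Example: mix a noise clip 10 dB below the speech level.
speech = np.random.randn(16000).astype(np.float32)
noise = 0.1 * np.random.randn(16000).astype(np.float32)
gain = noise_gain_db(speech, noise, snr_db=10.0)
mixed = speech + noise * (10.0 ** (gain / 20.0))
```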
\n", - "\n", - "Notably, the noise perturbed sample should sound as if there are two sounds playing at the same time (overlapping audio) as compared to the original signal. The magnitude of the noise will be dependent on step (3) and the location where the noise is added will depend on steps (4) and (5)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "cjSXci1v7Tlg", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import random\n", - "rng = random.Random(0)\n", - "noise = perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", - " min_snr_db=-10, max_snr_db=-10,\n", - " max_gain_db=300.0, rng=rng)\n", - "\n", - "# Perturb the audio file \n", - "sample_segment = load_gsc_sample()\n", - "noise.perturb(sample_segment)\n", - "\n", - "ipd.Audio(sample_segment.samples, rate=16000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kJjUkGJu7ern", - "colab_type": "text" - }, - "source": [ - "## Speed Perturbation\n", - "\n", - "Speed perturbation changes the speed of the speech, but does not preserve pitch of the sound. Try a few random augmentations to see how the pitch changes with change in duration of the audio file.\n", - "\n", - "**Note**: This is a very slow augmentation and is not advised to perform online augmentation for large datasets as it can dramatically increase training time." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ic-ziInU7ZKC", - "colab_type": "code", - "colab": {} - }, - "source": [ - "resample_type = 'kaiser_best' # Can be ['kaiser_best', 'kaiser_fast', 'fft', 'scipy']\n", - "speed = perturb.SpeedPerturbation(sr, resample_type, min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", - "\n", - "# Perturb the audio file \n", - "sample_segment = load_gsc_sample()\n", - "speed.perturb(sample_segment)\n", - "\n", - "ipd.Audio(sample_segment.samples, rate=16000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bhHX3dyh7jPq", - "colab_type": "text" - }, - "source": [ - "## Time Stretch Perturbation\n", - "\n", - "Time Stretch perturbation changes the speed of the speech, and also preserve pitch of the sound. \n", - "Try a few random augmentations to see how the pitch remains close to the same with change in duration of the audio file." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8_kNSfcK7lfP", - "colab_type": "text" - }, - "source": [ - "### Note about speed optimizations\n", - "\n", - "Time stretch is a costly augmentation, and can easily cause training time to increase drastically. 
It is suggested that one installs the `numba` library using conda to use a more optimized augmentation kernel.\n", - "\n", - "```python\n", - "conda install numba\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Dpeb0QUZ7g3l", - "colab_type": "code", - "colab": {} - }, - "source": [ - "time_stretch = perturb.TimeStretchPerturbation(min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", - "\n", - "# Perturb the audio file \n", - "sample_segment = load_gsc_sample()\n", - "time_stretch.perturb(sample_segment)\n", - "\n", - "ipd.Audio(sample_segment.samples, rate=16000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vhH1-Ga87rCX", - "colab_type": "text" - }, - "source": [ - "# Augmentation Pipeline\n", - "\n", - "The augmentation pipeline can be constructed in multiple ways, either explicitly by instantiating the objects of these perturbations or implicitly by providing the arguments to these augmentations as a nested dictionary.\n", - "\n", - "We will show both approaches in the following sections" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RC8_NOD97tlW", - "colab_type": "text" - }, - "source": [ - "## Explicit definition" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UwWE7swo72WP", - "colab_type": "text" - }, - "source": [ - "### Instantiate the perturbations" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GdLYn0hx7pRU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "perturbations = [\n", - " perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46),\n", - " perturb.GainPerturbation(min_gain_dbfs=0, max_gain_dbfs=50),\n", - " perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", - " min_snr_db=0, max_snr_db=50, max_gain_db=300.0)\n", - "]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CDSSbZ8w7zzR", - "colab_type": "text" - }, - "source": [ - "### Select chance of perturbations being applied" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NmoxfLSL7xPJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "probas = [1.0, 1.0, 0.5]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wl0tnrMq79Jh", - "colab_type": "text" - }, - "source": [ - "### Prepare the audio augmentation object" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "nO6T4U4f767o", - "colab_type": "code", - "colab": {} - }, - "source": [ - "augmentations = list(zip(probas, perturbations))\n", - "\n", - "audio_augmentations = perturb.AudioAugmentor(augmentations)\n", - "audio_augmentations._pipeline" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9cgI9yUx8Cyv", - "colab_type": "text" - }, - "source": [ - "## Implicit definition\n", - "\n", - "Implicit definitions are preferred since they can be prepared in the actual configuration object." 
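As a sketch of why the implicit form is convenient: the same nested dictionary can be handed to the helper that NeMo's ASR datasets use to build an `AudioAugmentor` from configuration. This assumes your NeMo version exposes `perturb.process_augmentations`; if it does not, fall back to the explicit construction shown above.

```python
from nemo.collections.asr.parts import perturb

# Implicit (nested dictionary) definition; keys must match entries in perturb.perturbation_types.
augmentor_config = dict(
    white_noise=dict(prob=1.0, min_level=-90, max_level=-46),
    gain=dict(prob=1.0, min_gain_dbfs=0, max_gain_dbfs=50),
)

# Assumed helper: converts the dictionary into an AudioAugmentor equivalent to the explicit pipeline.
audio_augmentor = perturb.process_augmentations(augmentor_config)
print(audio_augmentor._pipeline)
```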
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "tiqrKFTM7_mH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "perturb.perturbation_types # Available perturbations" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dbeXwLdw8VEc", - "colab_type": "text" - }, - "source": [ - "### Prepare the nested dictionary" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mbE0qEA98TRI", - "colab_type": "code", - "colab": {} - }, - "source": [ - "audio_augmentations = dict(\n", - " white_noise = dict(\n", - " prob=1.0,\n", - " min_level=-90,\n", - " max_level=-46\n", - " ),\n", - " gain = dict(\n", - " prob=1.0,\n", - " min_gain_dbfs=0,\n", - " max_gain_dbfs=50\n", - " ),\n", - " noise = dict(\n", - " prob=0.5,\n", - " manifest_path=noise_manifest_path,\n", - " min_snr_db=0,\n", - " max_snr_db=50,\n", - " max_gain_db=300.0\n", - " )\n", - ")\n", - "\n", - "audio_augmentations" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tcsoCe9-8ZM9", - "colab_type": "text" - }, - "source": [ - "### Supply `augmentor` as an argument to the `model.train_ds` config\n", - "\n", - "Most of the common datasets used by ASR models support the keyword `augmentor` - which can include a nested dictionary defining the implicit definition of an augmentation pipeline.\n", - "\n", - "Note, all ASR models support implicit declaration of augmentations. This includes - \n", - "\n", - "1) Speech To Label Models
\n", - "2) Speech To Text Models
\n", - "3) Speech To Text Models with BPE/WPE Support
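Concretely, wiring the implicit definition into a model configuration might look like the sketch below. The YAML filename comes from the training section later in this notebook; the exact merge step and the `open_dict` guard are illustrative assumptions, not the only way to do it.

```python
from omegaconf import OmegaConf, open_dict

# Load the MatchboxNet config downloaded in the training section below.
config = OmegaConf.load("configs/matchboxnet_3x1x64_v1.yaml")

# Nested dictionary describing the augmentation pipeline (as defined above).
audio_augmentations = dict(
    white_noise=dict(prob=1.0, min_level=-90, max_level=-46),
    gain=dict(prob=1.0, min_gain_dbfs=0, max_gain_dbfs=50),
)

# Attach the augmentor to the *training* dataset config only; the test config is
# deliberately left untouched so no test-time augmentation is performed.
with open_dict(config.model.train_ds):
    config.model.train_ds.augmentor = OmegaConf.create(audio_augmentations)

print(OmegaConf.to_yaml(config.model.train_ds))
```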
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0WOJC0fdBL5J", - "colab_type": "text" - }, - "source": [ - "# Training - Application of augmentations\n", - "\n", - "We will be describing the data loaders for a MatchboxNet model from the paper \"[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)\". The benefit of MatchboxNet over JASPER models is that they use Separable Convolutions, which greatly reduce the number of parameters required to get good model accuracy.\n", - "\n", - "Care must be taken not to apply augmentations to the test set!\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7iDWiIrzBzUA", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "yv3KWNjcAUnQ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# We will download the MatchboxNet configuration file for either v1 or v2 dataset here\n", - "DATASET_VER = 1\n", - "\n", - "if DATASET_VER == 1:\n", - " MODEL_CONFIG = \"matchboxnet_3x1x64_v1.yaml\"\n", - "else:\n", - " MODEL_CONFIG = \"matchboxnet_3x1x64_v2.yaml\"\n", - "\n", - "if not os.path.exists(f\"configs/{MODEL_CONFIG}\"):\n", - " !wget -P configs/ \"https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/asr/conf/{MODEL_CONFIG}\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "vOcv0ri3BkmA", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# This line will load the entire config of the MatchboxNet model\n", - "config_path = f\"configs/{MODEL_CONFIG}\"\n", - "config = OmegaConf.load(config_path)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mLsyceMSCIHV", - "colab_type": "text" - }, - "source": [ - "### Augmentation in train set only\n", - "\n", - "Note how the train dataset config supports the `augmentor` implicit definition, however the test config does not.\n", - "\n", - "This is essential to avoid mistakenly performing Test Time Augmentation." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "VgUVm7lGB8Cz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Has `augmentor`\n", - "print(config.model.train_ds.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "gURwQ2eyCE7o", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Does not have `augmentor`\n", - "print(config.model.test_ds.pretty())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_UV74AVlCo_m", - "colab_type": "text" - }, - "source": [ - "# Custom Perturbations\n", - "\n", - "We can define and use custom perturbations as required simply by extending the `Perturbation` class. \n", - "\n", - "Lets look at how we can build a custom Noise Perturbation that we can use to evaluate the effect of noise at inference time, in order to analyse the model's robustness to noise\n", - "\n", - "In evaluation mode, we want to set an explicit value for the `snr_db` parameter instead of uniformly sample it from a range. 
This allows us to control the signal to noise ratio without relying on randomness from the training implementation of `NoisePerturbation`.\n", - "\n", - "Further, we force a random seed in order to produce reproduceable results on the evaluation set.\n", - "\n", - "With this combination, we can easily evaluate each sample in the test set `S` times (`S` being the number of random seeds), and can evaluate each of these samples at `D` levels of Signal to Noise Ratio (in dB). " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Q9YBmBiZCbAX", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# We use a NeMo utility to parse the manifest file for us\n", - "from nemo.collections.asr.parts import collections, parsers\n", - "\n", - "class NoisePerturbationEval(perturb.Perturbation):\n", - " def __init__(\n", - " self, manifest_path=None, snr_db=40, max_gain_db=300.0, seed=None,\n", - " ):\n", - " seed = seed if seed is not None else 0\n", - " self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]))\n", - " self._snr_db = snr_db\n", - " self._max_gain_db = max_gain_db\n", - " self._rng = random.Random(seed)\n", - " \n", - " # This is mostly obtained from the original NoisePerturbation class itself\n", - " def perturb(self, data):\n", - " snr_db = self._snr_db\n", - " noise_record = self._rng.sample(self._manifest.data, 1)[0]\n", - " noise = AudioSegment.from_file(noise_record.audio_file, target_sr=data.sample_rate)\n", - " noise_gain_db = min(data.rms_db - noise.rms_db - snr_db, self._max_gain_db)\n", - "\n", - " # calculate noise segment to use\n", - " start_time = 0.0\n", - " if noise.duration > (start_time + data.duration):\n", - " noise.subsegment(start_time=start_time, end_time=start_time + data.duration)\n", - "\n", - " # adjust gain for snr purposes and superimpose\n", - " noise.gain_db(noise_gain_db)\n", - "\n", - " if noise._samples.shape[0] < data._samples.shape[0]:\n", - " noise_idx = data._samples.shape[0] // 2 # midpoint of audio\n", - " while (noise_idx + noise._samples.shape[0]) > data._samples.shape[0]:\n", - " noise_idx = noise_idx // 2 # half the initial starting point\n", - "\n", - " data._samples[noise_idx: noise_idx + noise._samples.shape[0]] += noise._samples\n", - "\n", - " else:\n", - " data._samples += noise._samples\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qR8qiwSkC1eE", - "colab_type": "text" - }, - "source": [ - "## Registering augmentations\n", - "\n", - "We can use either approach to submit this test time augmentation to the Data Loaders.\n", - "\n", - "In order to obtain the convenience of the implicit method, we must register this augmentation into NeMo's directory of available augmentations. 
This can be done as follows -" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "40Z4Fm88CxWA", - "colab_type": "code", - "colab": {} - }, - "source": [ - "perturb.register_perturbation(name='noise_eval', perturbation=NoisePerturbationEval)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jVVbRxb-C4hB", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Lets check the registry of allowed perturbations !\n", - "perturb.perturbation_types" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2fiHz6CdC-B1", - "colab_type": "text" - }, - "source": [ - "## Overriding pre-existing augmentations\n", - "\n", - "**Note**: It is not allowed to overwrite already registered perturbations using the `perturb.register_perturbation` method. It will raise a `ValueError` in order to prevent overwriting the pre-existing perturbation types" - ] - } - ] -} \ No newline at end of file + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "cvXwyS263AMk", + "outputId": "57646cdd-58c1-4ddb-8805-9178cb0a2048" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "## Install dependencies\n", + "!pip install wget\n", + "!apt-get install sox libsndfile1 ffmpeg\n", + "!pip install unidecode\n", + "\n", + "# ## Install NeMo\n", + "!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]\n", + "\n", + "## Install TorchAudio\n", + "!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "## Grab the config we'll use in this example\n", + "!mkdir configs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Kqg4Rwki4jBX" + }, + "source": [ + "# Introduction\n", + "\n", + "Data augmentation is a useful method to improve the performance of models which is applicable across multiple domains. Certain augmentations can also substantially improve robustness of models to noisy samples. \n", + "\n", + "In this notebook, we describe how to construct an augmentation pipeline inside [Neural Modules (NeMo)](https://github.com/NVIDIA/NeMo), enable augmented training of a [MatchboxNet model](https://arxiv.org/abs/2004.08531 ) ( based on QuartzNet, from the paper [\"QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions\"](https://arxiv.org/abs/1910.10261)) and finally how to construct custom augmentations to add to NeMo.\n", + "\n", + "The notebook will follow the steps below:\n", + "\n", + " - Dataset preparation: Preparing a noise dataset using an example file.\n", + "\n", + " - Construct a data augmentation pipeline.\n", + " \n", + " - Construct a custom augmentation and register it for use in NeMo." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5XieMEo84pJ-" + }, + "source": [ + "## Note\n", + "Data augmentation is valuable for many datasets, but it comes at the cost of increased training time if samples are augmented during training time. Certain augmentations are particularly costly, in terms of how much time they take to process a single sample. A few examples of slow augmentations available in NeMo are : \n", + "\n", + " - Speed Perturbation\n", + " - Time Stretch Perturbation (Sample level)\n", + " - Noise Perturbation\n", + " - Impulse Perturbation\n", + " - Time Stretch Augmentation (Batch level, Neural Module)\n", + " \n", + "For such augmentations, it is advisable to pre-process the dataset offline for a one time preprocessing cost and then train the dataset on this augmented training set." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Tgc_ZHDl4sMy" + }, + "source": [ + "## Taking a Look at Our Data (AN4)\n", + "\n", + "The AN4 dataset, also known as the Alphanumeric dataset, was collected and published by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. We choose to use AN4 for this tutorial because it is relatively small, with 948 training and 130 test utterances, and so it trains quickly.\n", + "\n", + "Before we get started, let's download and prepare the dataset. The utterances are available as `.sph` files, so we will need to convert them to `.wav` for later processing. Please make sure you have [Sox](http://sox.sourceforge.net/) installed for this step (see the \"Downloads\" section of the main page)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DtLm_XuQ3pmk" + }, + "outputs": [], + "source": [ + "# This is where the an4/ directory will be placed.\n", + "# Change this if you don't want the data to be extracted in the current directory.\n", + "data_dir = '.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 + }, + "colab_type": "code", + "id": "HjfLhUtH4wNc", + "outputId": "f0a9cd46-6709-49dd-9103-1e0ef61de745" + }, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import subprocess\n", + "import tarfile\n", + "import wget\n", + "\n", + "# Download the dataset. 
This will take a few moments...\n", + "print(\"******\")\n", + "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", + " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_path = wget.download(an4_url, data_dir)\n", + " print(f\"Dataset downloaded at: {an4_path}\")\n", + "else:\n", + " print(\"Tarfile already exists.\")\n", + " an4_path = data_dir + '/an4_sphere.tar.gz'\n", + "\n", + "if not os.path.exists(data_dir + '/an4/'):\n", + " # Untar and convert .sph to .wav (using sox)\n", + " tar = tarfile.open(an4_path)\n", + " tar.extractall(path=data_dir)\n", + "\n", + " print(\"Converting .sph to .wav...\")\n", + " sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", + " for sph_path in sph_list:\n", + " wav_path = sph_path[:-4] + '.wav'\n", + " cmd = [\"sox\", sph_path, wav_path]\n", + " subprocess.run(cmd)\n", + "print(\"Finished conversion.\\n******\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HqJmf4WB5P1x" + }, + "source": [ + "You should now have a folder called `an4` that contains `etc/an4_train.transcription`, `etc/an4_test.transcription`, audio files in `wav/an4_clstk` and `wav/an4test_clstk`, along with some other files we will not need.\n", + "\n", + "We now build a few manifest files which will be used later:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "colab_type": "code", + "id": "AmR6CH025C8E", + "outputId": "0cd776ea-078f-4ab8-8a79-eed3e1c05839" + }, + "outputs": [], + "source": [ + "# --- Building Manifest Files --- #\n", + "import json\n", + "import librosa\n", + "\n", + "# Function to build a manifest\n", + "def build_manifest(transcripts_path, manifest_path, wav_path):\n", + " with open(transcripts_path, 'r') as fin:\n", + " with open(manifest_path, 'w') as fout:\n", + " for line in fin:\n", + " # Lines look like this:\n", + " # transcript (fileID)\n", + " transcript = line[: line.find('(')-1].lower()\n", + " transcript = transcript.replace('', '').replace('', '')\n", + " transcript = transcript.strip()\n", + "\n", + " file_id = line[line.find('(')+1 : -2] # e.g. 
\"cen4-fash-b\"\n", + " audio_path = os.path.join(\n", + " data_dir, wav_path,\n", + " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", + " file_id + '.wav')\n", + "\n", + " duration = librosa.core.get_duration(filename=audio_path)\n", + "\n", + " # Write the metadata to the manifest\n", + " metadata = {\n", + " \"audio_filepath\": audio_path,\n", + " \"duration\": duration,\n", + " \"text\": transcript\n", + " }\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + " \n", + "# Building Manifests\n", + "print(\"******\")\n", + "train_transcripts = data_dir + '/an4/etc/an4_train.transcription'\n", + "train_manifest = data_dir + '/an4/train_manifest.json'\n", + "build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n", + "print(\"Training manifest created.\")\n", + "\n", + "test_transcripts = data_dir + '/an4/etc/an4_test.transcription'\n", + "test_manifest = data_dir + '/an4/test_manifest.json'\n", + "build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", + "print(\"Test manifest created.\")\n", + "print(\"******\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EQsXzh7x5zIQ" + }, + "source": [ + "## Prepare the path to manifest files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vmOa0IRC5eW4" + }, + "outputs": [], + "source": [ + "dataset_basedir = os.path.join(data_dir, 'an4')\n", + "\n", + "train_dataset = os.path.join(dataset_basedir, 'train_manifest.json')\n", + "test_dataset = os.path.join(dataset_basedir, 'test_manifest.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pz9LC3yZ6J1Q" + }, + "source": [ + "## Read a few rows of the manifest file \n", + "\n", + "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", + "\n", + "1) `audio_filepath`: Refers to the path to the raw audio file
\n", + "2) `text`: The text transcript of this sample
\n", + "3) `duration`: The length of the audio file, in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3OzZQiX751iz" + }, + "outputs": [], + "source": [ + "!head -n 5 {train_dataset}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "pD9bprV66Oai" + }, + "source": [ + "# Data Augmentation Pipeline\n", + "\n", + "Constructing a data augmentation pipeline in NeMo is as simple as composing a nested dictionary that describes two things :\n", + "\n", + "1) The probability of that augmentation occuring - using the `prob` keyword
\n", + "2) The keyword arguments required by that augmentation class\n", + "\n", + "Below, we show a few samples of these augmentations. Note, in order to distinguish between the original sample and the perturbed sample, we exaggerate the perturbation strength significantly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "l5bc7gYO6MHG" + }, + "outputs": [], + "source": [ + "import torch\n", + "import IPython.display as ipd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "L8Bd8s3e6TeK" + }, + "source": [ + "## Audio file preparation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "g7f9riZz6Qnj" + }, + "outputs": [], + "source": [ + "# Import the data augmentation component from ASR collection\n", + "from nemo.collections.asr.parts import perturb, segment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wK8uwpt16d6I" + }, + "outputs": [], + "source": [ + "# Lets see the available perturbations\n", + "perturb.perturbation_types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IP1VpkOA6nE-" + }, + "source": [ + "### Obtain a baseline audio file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sj4DNMmZ6ktm" + }, + "outputs": [], + "source": [ + "filepath = librosa.util.example_audio_file()\n", + "sample, sr = librosa.core.load(filepath)\n", + "\n", + "ipd.Audio(sample, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "M9mZNm296tNf" + }, + "source": [ + "### Convert to WAV format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "QDjlgLc-6vtq" + }, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "\n", + "# lets convert this ogg file into a wave to be compatible with NeMo\n", + "if not os.path.exists('./media'):\n", + " os.makedirs('./media/')\n", + " \n", + "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + "filepath = os.path.join('media', filename)\n", + "\n", + "sf.write(filepath, sample, samplerate=sr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FEkV-ikT6xgB" + }, + "outputs": [], + "source": [ + "sample, sr = librosa.core.load(filepath)\n", + "ipd.Audio(sample, rate=sr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gmuwEwIQ6zK3" + }, + "outputs": [], + "source": [ + "# NeMo has its own support class for loading wav files\n", + "def load_audio() -> segment.AudioSegment:\n", + " filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + " filepath = os.path.join('media', filename)\n", + " sample_segment = segment.AudioSegment.from_file(filepath, target_sr=sr)\n", + " return sample_segment\n", + "\n", + "sample_segment = load_audio()\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hTnf1g1y63wZ" + }, + "source": [ + "## White Noise Perturbation\n", + "\n", + "White Noise perturbation is performed by the following steps :
\n", + "1) Randomly sample the amplitude of the noise from a uniformly distributed range (defined in dB)
\n", + "2) Sample gaussian noise (mean = 0, std = 1) with same length as audio signal
\n", + "3) Scale this gaussian noise by the amplitude (in dB scale)
\n", + "4) Add this noise vector to the original sample\n", + "\n", + "Notably, the original signal should not have a \"hissing sound\" constantly present in the perturbed version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2jaPQyUY65ij" + }, + "outputs": [], + "source": [ + "white_noise = perturb.WhiteNoisePerturbation(min_level=-50, max_level=-30)\n", + "\n", + "# Perturb the audio file\n", + "sample_segment = load_audio()\n", + "white_noise.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2dfwesJU7DhN" + }, + "source": [ + "## Shift Perturbation\n", + "\n", + "Shift perturbation is performed by the following steps :
\n", + "1) Randomly sample the shift factor of the signal from a uniformly distributed range (defined in milliseconds)
\n", + "2) Depending on the sign of the shift, we shift the original signal to the left or the right.
\n", + "3) The boundary locations are filled with zeros after the shift of the signal
\n", + "\n", + "Notably, the perturbed signal below skips the first 25 to 50 seconds of the original audio below, and the remainder of the time is simply silence. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2ONq8dBI7BZf" + }, + "outputs": [], + "source": [ + "shift = perturb.ShiftPerturbation(min_shift_ms=25000.0, max_shift_ms=50000.0)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_audio()\n", + "shift.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kywA3h4T7G_S" + }, + "source": [ + "## Data Dependent Perturbations\n", + "\n", + "Some perturbations require an external data source in order to perturb the original sample. Noise Perturbation is a perfect example of one such augmentation that requires an external noise source dataset in order to pertur the original data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eYm2DgGQ7KPe" + }, + "source": [ + "### Preparing a manifest of \"noise\" samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RXZ1o85E7FLT" + }, + "outputs": [], + "source": [ + "# Lets prepare a manifest file using the baseline file itself, cut into 1 second segments\n", + "\n", + "def write_manifest(filepath, data_dir='./media/', manifest_name='noise_manifest', duration_max=None, duration_stride=1.0, filter_long=False, duration_limit=10.0):\n", + " if duration_max is None:\n", + " duration_max = 1e9\n", + " \n", + " with open(os.path.join(data_dir, manifest_name + '.json'), 'w') as fout:\n", + " \n", + " try:\n", + " x, _sr = librosa.load(filepath)\n", + " duration = librosa.get_duration(x, sr=_sr)\n", + "\n", + " except Exception:\n", + " print(f\"\\n>>>>>>>>> WARNING: Librosa failed to load file {filepath}. 
Skipping this file !\\n\")\n", + " return\n", + "\n", + " if filter_long and duration > duration_limit:\n", + " print(f\"Skipping sound sample {filepath}, exceeds duration limit of {duration_limit}\")\n", + " return\n", + "\n", + " offsets = []\n", + " durations = []\n", + "\n", + " if duration > duration_max:\n", + " current_offset = 0.0\n", + "\n", + " while current_offset < duration:\n", + " difference = duration - current_offset\n", + " segment_duration = min(duration_max, difference)\n", + "\n", + " offsets.append(current_offset)\n", + " durations.append(segment_duration)\n", + "\n", + " current_offset += duration_stride\n", + "\n", + " else:\n", + " offsets.append(0.0)\n", + " durations.append(duration)\n", + "\n", + "\n", + " for duration, offset in zip(durations, offsets):\n", + " metadata = {\n", + " 'audio_filepath': filepath,\n", + " 'duration': duration,\n", + " 'label': 'noise',\n", + " 'text': '_', # for compatibility with ASRAudioText collection\n", + " 'offset': offset,\n", + " }\n", + "\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + " fout.flush()\n", + "\n", + " print(f\"Wrote {len(durations)} segments for filename {filename}\")\n", + " \n", + " print(\"Finished preparing manifest !\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wLTT8jlP7NdU" + }, + "outputs": [], + "source": [ + "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + "filepath = os.path.join('media', filename)\n", + "\n", + "# Write a \"noise\" manifest file\n", + "write_manifest(filepath, manifest_name='noise_1s', duration_max=1.0, duration_stride=1.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "izbdrSmd7PY5" + }, + "outputs": [], + "source": [ + "# Lets read this noise manifest file\n", + "noise_manifest_path = os.path.join('media', 'noise_1s.json')\n", + "\n", + "!head -n 5 {noise_manifest_path}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "82yq0TOV7Q_4" + }, + "outputs": [], + "source": [ + "# Lets create a helper method to load the first file in the train dataset of AN4\n", + "# Load the first sample in the manifest\n", + "def load_gsc_sample() -> segment.AudioSegment:\n", + " with open(train_dataset, 'r') as f:\n", + " line = f.readline()\n", + " \n", + " line = json.loads(line)\n", + " gsc_filepath = line['audio_filepath']\n", + " sample_segment = segment.AudioSegment.from_file(gsc_filepath)\n", + " return sample_segment\n", + "\n", + "gsc_sample_segment = load_gsc_sample()\n", + "ipd.Audio(gsc_sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zV9ypBqz7V9a" + }, + "source": [ + "## Noise Augmentation\n", + "\n", + "Noise perturbation is performed by the following steps :
\n", + "1) Randomly sample the amplitude scale of the noise sample from a uniformly distributed range (defined in dB)
\n", + "2) Randomly choose an audio clip from the set of noise audio samples available
\n", + "3) Compute the gain (in dB) required for the noise clip as compared to the original sample and scale the noise by this factor
\n", + "4) If the noise snippet is of shorter duration than the original audio, then randomly select an index in time from the original sample, where the noise snippet will be added
\n", + "5) If instead the noise snippet is longer than the duration of the original audio, then randomly subsegment the noise snippet and add the full snippet to the original audio
\n", + "\n", + "Notably, the noise perturbed sample should sound as if there are two sounds playing at the same time (overlapping audio) as compared to the original signal. The magnitude of the noise will be dependent on step (3) and the location where the noise is added will depend on steps (4) and (5)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cjSXci1v7Tlg" + }, + "outputs": [], + "source": [ + "import random\n", + "rng = random.Random(0)\n", + "noise = perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", + " min_snr_db=-10, max_snr_db=-10,\n", + " max_gain_db=300.0, rng=rng)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "noise.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kJjUkGJu7ern" + }, + "source": [ + "## Speed Perturbation\n", + "\n", + "Speed perturbation changes the speed of the speech, but does not preserve pitch of the sound. Try a few random augmentations to see how the pitch changes with change in duration of the audio file.\n", + "\n", + "**Note**: This is a very slow augmentation and is not advised to perform online augmentation for large datasets as it can dramatically increase training time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ic-ziInU7ZKC" + }, + "outputs": [], + "source": [ + "resample_type = 'kaiser_best' # Can be ['kaiser_best', 'kaiser_fast', 'fft', 'scipy']\n", + "speed = perturb.SpeedPerturbation(sr, resample_type, min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "speed.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "bhHX3dyh7jPq" + }, + "source": [ + "## Time Stretch Perturbation\n", + "\n", + "Time Stretch perturbation changes the speed of the speech, and also preserve pitch of the sound. \n", + "Try a few random augmentations to see how the pitch remains close to the same with change in duration of the audio file." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8_kNSfcK7lfP" + }, + "source": [ + "### Note about speed optimizations\n", + "\n", + "Time stretch is a costly augmentation, and can easily cause training time to increase drastically. 
It is suggested that one installs the `numba` library using conda to use a more optimized augmentation kernel.\n", + "\n", + "```python\n", + "conda install numba\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Dpeb0QUZ7g3l" + }, + "outputs": [], + "source": [ + "time_stretch = perturb.TimeStretchPerturbation(min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "time_stretch.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vhH1-Ga87rCX" + }, + "source": [ + "# Augmentation Pipeline\n", + "\n", + "The augmentation pipeline can be constructed in multiple ways, either explicitly by instantiating the objects of these perturbations or implicitly by providing the arguments to these augmentations as a nested dictionary.\n", + "\n", + "We will show both approaches in the following sections" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RC8_NOD97tlW" + }, + "source": [ + "## Explicit definition" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "UwWE7swo72WP" + }, + "source": [ + "### Instantiate the perturbations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "GdLYn0hx7pRU" + }, + "outputs": [], + "source": [ + "perturbations = [\n", + " perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46),\n", + " perturb.GainPerturbation(min_gain_dbfs=0, max_gain_dbfs=50),\n", + " perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", + " min_snr_db=0, max_snr_db=50, max_gain_db=300.0)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CDSSbZ8w7zzR" + }, + "source": [ + "### Select chance of perturbations being applied" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NmoxfLSL7xPJ" + }, + "outputs": [], + "source": [ + "probas = [1.0, 1.0, 0.5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wl0tnrMq79Jh" + }, + "source": [ + "### Prepare the audio augmentation object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "nO6T4U4f767o" + }, + "outputs": [], + "source": [ + "augmentations = list(zip(probas, perturbations))\n", + "\n", + "audio_augmentations = perturb.AudioAugmentor(augmentations)\n", + "audio_augmentations._pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9cgI9yUx8Cyv" + }, + "source": [ + "## Implicit definition\n", + "\n", + "Implicit definitions are preferred since they can be prepared in the actual configuration object." 
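+    "\n",
+    "As a sketch of how such a nested dictionary is consumed, NeMo's dataset helpers turn it into the same `AudioAugmentor` pipeline that was built explicitly above. The helper used below, `perturb.process_augmentations`, is assumed to be available in the installed NeMo version.\n",
+    "\n",
+    "```python\n",
+    "# Sketch: convert a nested augmentation dictionary into an AudioAugmentor.\n",
+    "# Assumes perturb.process_augmentations exists in this NeMo version.\n",
+    "sample_augment_config = dict(\n",
+    "    white_noise=dict(prob=1.0, min_level=-90, max_level=-46),\n",
+    "    gain=dict(prob=1.0, min_gain_dbfs=0, max_gain_dbfs=50),\n",
+    ")\n",
+    "\n",
+    "augmentor = perturb.process_augmentations(sample_augment_config)\n",
+    "print(augmentor._pipeline)\n",
+    "```\n",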
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tiqrKFTM7_mH" + }, + "outputs": [], + "source": [ + "perturb.perturbation_types # Available perturbations" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dbeXwLdw8VEc" + }, + "source": [ + "### Prepare the nested dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mbE0qEA98TRI" + }, + "outputs": [], + "source": [ + "audio_augmentations = dict(\n", + " white_noise = dict(\n", + " prob=1.0,\n", + " min_level=-90,\n", + " max_level=-46\n", + " ),\n", + " gain = dict(\n", + " prob=1.0,\n", + " min_gain_dbfs=0,\n", + " max_gain_dbfs=50\n", + " ),\n", + " noise = dict(\n", + " prob=0.5,\n", + " manifest_path=noise_manifest_path,\n", + " min_snr_db=0,\n", + " max_snr_db=50,\n", + " max_gain_db=300.0\n", + " )\n", + ")\n", + "\n", + "audio_augmentations" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tcsoCe9-8ZM9" + }, + "source": [ + "### Supply `augmentor` as an argument to the `model.train_ds` config\n", + "\n", + "Most of the common datasets used by ASR models support the keyword `augmentor` - which can include a nested dictionary defining the implicit definition of an augmentation pipeline.\n", + "\n", + "Note, all ASR models support implicit declaration of augmentations. This includes - \n", + "\n", + "1) Speech To Label Models
\n", + "2) Speech To Text Models
\n", + "3) Speech To Text Models with BPE/WPE Support
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0WOJC0fdBL5J" + }, + "source": [ + "# Training - Application of augmentations\n", + "\n", + "We will be describing the data loaders for a MatchboxNet model from the paper \"[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)\". The benefit of MatchboxNet over JASPER models is that they use Separable Convolutions, which greatly reduce the number of parameters required to get good model accuracy.\n", + "\n", + "Care must be taken not to apply augmentations to the test set!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7iDWiIrzBzUA" + }, + "outputs": [], + "source": [ + "from omegaconf import OmegaConf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yv3KWNjcAUnQ" + }, + "outputs": [], + "source": [ + "# We will download the MatchboxNet configuration file for either v1 or v2 dataset here\n", + "DATASET_VER = 1\n", + "\n", + "if DATASET_VER == 1:\n", + " MODEL_CONFIG = \"matchboxnet_3x1x64_v1.yaml\"\n", + "else:\n", + " MODEL_CONFIG = \"matchboxnet_3x1x64_v2.yaml\"\n", + "\n", + "if not os.path.exists(f\"configs/{MODEL_CONFIG}\"):\n", + " !wget -P configs/ \"https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/asr/conf/{MODEL_CONFIG}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vOcv0ri3BkmA" + }, + "outputs": [], + "source": [ + "# This line will load the entire config of the MatchboxNet model\n", + "config_path = f\"configs/{MODEL_CONFIG}\"\n", + "config = OmegaConf.load(config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mLsyceMSCIHV" + }, + "source": [ + "### Augmentation in train set only\n", + "\n", + "Note how the train dataset config supports the `augmentor` implicit definition, however the test config does not.\n", + "\n", + "This is essential to avoid mistakenly performing Test Time Augmentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "VgUVm7lGB8Cz" + }, + "outputs": [], + "source": [ + "# Has `augmentor`\n", + "print(config.model.train_ds.pretty())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gURwQ2eyCE7o" + }, + "outputs": [], + "source": [ + "# Does not have `augmentor`\n", + "print(config.model.test_ds.pretty())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_UV74AVlCo_m" + }, + "source": [ + "# Custom Perturbations\n", + "\n", + "We can define and use custom perturbations as required simply by extending the `Perturbation` class. \n", + "\n", + "Lets look at how we can build a custom Noise Perturbation that we can use to evaluate the effect of noise at inference time, in order to analyse the model's robustness to noise\n", + "\n", + "In evaluation mode, we want to set an explicit value for the `snr_db` parameter instead of uniformly sample it from a range. 
This allows us to control the signal to noise ratio without relying on randomness from the training implementation of `NoisePerturbation`.\n", + "\n", + "Further, we force a random seed in order to produce reproduceable results on the evaluation set.\n", + "\n", + "With this combination, we can easily evaluate each sample in the test set `S` times (`S` being the number of random seeds), and can evaluate each of these samples at `D` levels of Signal to Noise Ratio (in dB). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Q9YBmBiZCbAX" + }, + "outputs": [], + "source": [ + "# We use a NeMo utility to parse the manifest file for us\n", + "from nemo.collections.asr.parts import collections, parsers\n", + "\n", + "class NoisePerturbationEval(perturb.Perturbation):\n", + " def __init__(\n", + " self, manifest_path=None, snr_db=40, max_gain_db=300.0, seed=None,\n", + " ):\n", + " seed = seed if seed is not None else 0\n", + " self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]))\n", + " self._snr_db = snr_db\n", + " self._max_gain_db = max_gain_db\n", + " self._rng = random.Random(seed)\n", + " \n", + " # This is mostly obtained from the original NoisePerturbation class itself\n", + " def perturb(self, data):\n", + " snr_db = self._snr_db\n", + " noise_record = self._rng.sample(self._manifest.data, 1)[0]\n", + " noise = AudioSegment.from_file(noise_record.audio_file, target_sr=data.sample_rate)\n", + " noise_gain_db = min(data.rms_db - noise.rms_db - snr_db, self._max_gain_db)\n", + "\n", + " # calculate noise segment to use\n", + " start_time = 0.0\n", + " if noise.duration > (start_time + data.duration):\n", + " noise.subsegment(start_time=start_time, end_time=start_time + data.duration)\n", + "\n", + " # adjust gain for snr purposes and superimpose\n", + " noise.gain_db(noise_gain_db)\n", + "\n", + " if noise._samples.shape[0] < data._samples.shape[0]:\n", + " noise_idx = data._samples.shape[0] // 2 # midpoint of audio\n", + " while (noise_idx + noise._samples.shape[0]) > data._samples.shape[0]:\n", + " noise_idx = noise_idx // 2 # half the initial starting point\n", + "\n", + " data._samples[noise_idx: noise_idx + noise._samples.shape[0]] += noise._samples\n", + "\n", + " else:\n", + " data._samples += noise._samples\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qR8qiwSkC1eE" + }, + "source": [ + "## Registering augmentations\n", + "\n", + "We can use either approach to submit this test time augmentation to the Data Loaders.\n", + "\n", + "In order to obtain the convenience of the implicit method, we must register this augmentation into NeMo's directory of available augmentations. 
This can be done as follows -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "40Z4Fm88CxWA" + }, + "outputs": [], + "source": [ + "perturb.register_perturbation(name='noise_eval', perturbation=NoisePerturbationEval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jVVbRxb-C4hB" + }, + "outputs": [], + "source": [ + "# Lets check the registry of allowed perturbations !\n", + "perturb.perturbation_types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2fiHz6CdC-B1" + }, + "source": [ + "## Overriding pre-existing augmentations\n", + "\n", + "**Note**: It is not allowed to overwrite already registered perturbations using the `perturb.register_perturbation` method. It will raise a `ValueError` in order to prevent overwriting the pre-existing perturbation types" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "05_Online_Noise_Augmentation.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 440d7895b5f30babdbb6cb53a10257a4fbc91834 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Mon, 31 Aug 2020 13:19:48 -0700 Subject: [PATCH 11/12] fix readme Signed-off-by: Oleksii Kuchaiev --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 9ab52fd967c4..886de4ad18fd 100644 --- a/README.rst +++ b/README.rst @@ -101,6 +101,7 @@ To run tutorials: * - NLP - Token Classification (Named Entity Recognition) - `Token_Classification_Named_Entity_Recognition_tutorial.ipynb `_ + * - NLP - Punctuation and Capitialization - `Punctuation_and_Capitalization.ipynb `_ From 45da106206c334f23ad0673993f66e3218db7e60 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 31 Aug 2020 16:51:00 -0400 Subject: [PATCH 12/12] make tests better (#1094) Signed-off-by: Jason Co-authored-by: Oleksii Kuchaiev --- tests/collections/asr/test_asr_modules.py | 55 ++++++++++++++++------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/tests/collections/asr/test_asr_modules.py b/tests/collections/asr/test_asr_modules.py index 21f7b4c2c8d3..b90572f4e896 100644 --- a/tests/collections/asr/test_asr_modules.py +++ b/tests/collections/asr/test_asr_modules.py @@ -20,23 +20,48 @@ class TestASRModulesBasicTests: @pytest.mark.unit - def test_AudioToMelSpectrogramPreprocessor(self): - # Make sure constructor works - instance1 = modules.AudioToMelSpectrogramPreprocessor(dither=0) + def test_AudioToMelSpectrogramPreprocessor1(self): + # Test 1 that should test the pure stft implementation as much as possible + instance1 = modules.AudioToMelSpectrogramPreprocessor( + dither=0, stft_conv=False, mag_power=1.0, normalize=False, preemph=0.0, log=False, pad_to=0 + ) + instance2 = modules.AudioToMelSpectrogramPreprocessor( + dither=0, stft_conv=True, mag_power=1.0, normalize=False, preemph=0.0, log=False, pad_to=0 + ) + + # Ensure that the two functions behave similarily + for _ in range(10): + input_signal = torch.randn(size=(4, 512)) + length = 
torch.randint(low=161, high=500, size=[4])
+            res1, length1 = instance1(input_signal=input_signal, length=length)
+            res2, length2 = instance2(input_signal=input_signal, length=length)
+            for len1, len2 in zip(length1, length2):
+                assert len1 == len2
+            assert res1.shape == res2.shape
+            diff = torch.mean(torch.abs(res1 - res2))
+            assert diff <= 1e-3
+            diff = torch.max(torch.abs(res1 - res2))
+            assert diff <= 1e-2
+
+    @pytest.mark.unit
+    def test_AudioToMelSpectrogramPreprocessor2(self):
+        # Test 2 that should test the stft implementation as used in ASR models
+        instance1 = modules.AudioToMelSpectrogramPreprocessor(dither=0, stft_conv=False)
         instance2 = modules.AudioToMelSpectrogramPreprocessor(dither=0, stft_conv=True)
 
-        # Make sure forward doesn't throw with expected input
-        input_signal = torch.randn(size=(4, 512))
-        length = torch.randint(low=161, high=500, size=[4])
-        res1, length1 = instance1(input_signal=input_signal, length=length)
-        res2, length2 = instance2(input_signal=input_signal, length=length)
-        for len1, len2 in zip(length1, length2):
-            assert len1 == len2
-        assert res1.shape == res2.shape
-        diff = torch.mean(torch.abs(res1 - res2))
-        assert diff <= 3e-3
-        diff = torch.max(torch.abs(res1 - res2))
-        assert diff <= 1
+        # Ensure that the two functions behave similarly
+        for _ in range(5):
+            input_signal = torch.randn(size=(4, 512))
+            length = torch.randint(low=161, high=500, size=[4])
+            res1, length1 = instance1(input_signal=input_signal, length=length)
+            res2, length2 = instance2(input_signal=input_signal, length=length)
+            for len1, len2 in zip(length1, length2):
+                assert len1 == len2
+            assert res1.shape == res2.shape
+            diff = torch.mean(torch.abs(res1 - res2))
+            assert diff <= 3e-3
+            diff = torch.max(torch.abs(res1 - res2))
+            assert diff <= 2
 
     @pytest.mark.unit
     def test_SpectrogramAugmentationr(self):