diff --git a/CHANGELOG.md b/CHANGELOG.md
index 923ecb78cdd8..58b63fcec751 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -73,6 +73,7 @@ To release a new version, please update the changelog as followed:
 - Added multi-dataset data-layer and dataset. ([PR #538](https://github.com/NVIDIA/NeMo/pull/538)) - @yzhang123
 - Online Data Augmentation for ASR Collection. ([PR #565](https://github.com/NVIDIA/NeMo/pull/565)) - @titu1994
+- Speed augmentation on CPU, TimeStretch augmentation on CPU+GPU ([PR #594](https://github.com/NVIDIA/NeMo/pull/594)) - @titu1994
 
 ### Changed
 
diff --git a/Dockerfile b/Dockerfile
index c3203a28930a..6010887ff8c0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,14 +30,23 @@ RUN apt-get update && \
     python-dev && \
     rm -rf /var/lib/apt/lists/*
 
-# build trt open source plugins
+# install trt
 ENV PATH=$PATH:/usr/src/tensorrt/bin
 WORKDIR /tmp/trt-oss
-RUN git clone --recursive --branch release/7.0 https://github.com/NVIDIA/TensorRT.git && cd TensorRT && \
-    mkdir build && cd build && \
-    cmake .. -DCMAKE_BUILD_TYPE=Release -DTRT_LIB_DIR=/usr/lib/x86_64-linux-gnu/ -DTRT_BIN_DIR=`pwd` \
-    -DBUILD_PARSERS=OFF -DBUILD_SAMPLES=OFF -DBUILD_PLUGINS=ON -DGPU_ARCHS="70 75" && \
-    make -j nvinfer_plugin
+ARG NV_REPO=https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64
+
+RUN cd /tmp/trt-oss
+ARG DEB=libcudnn7_7.6.5.32-1+cuda10.2_amd64.deb
+RUN curl -sL --output ${DEB} ${NV_REPO}/${DEB}
+ARG DEB=libnvinfer7_7.0.0-1+cuda10.2_amd64.deb
+RUN curl -sL --output ${DEB} ${NV_REPO}/${DEB}
+ARG DEB=libnvinfer-plugin7_7.0.0-1+cuda10.2_amd64.deb
+RUN curl -sL --output ${DEB} ${NV_REPO}/${DEB}
+ARG DEB=libnvonnxparsers7_7.0.0-1+cuda10.2_amd64.deb
+RUN curl -sL --output ${DEB} ${NV_REPO}/${DEB}
+ARG DEB=python-libnvinfer_7.0.0-1+cuda10.2_amd64.deb
+RUN curl -sL --output ${DEB} ${NV_REPO}/${DEB}
+RUN dpkg -i *.deb && cd ../.. && rm -rf /tmp/trt-oss
 
 # install nemo dependencies
 WORKDIR /tmp/nemo
@@ -53,9 +62,6 @@ FROM nemo-deps as nemo
 ARG NEMO_VERSION
 ARG BASE_IMAGE
 
-# copy oss trt plugins
-COPY --from=nemo-deps /tmp/trt-oss/TensorRT/build/libnvinfer_plugin.so* /usr/lib/x86_64-linux-gnu/
-
 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
 # version information as runtime environment variable for introspection purposes
 RUN /usr/bin/test -n "$NEMO_VERSION" && \
diff --git a/docs/sources/source/asr/installation.rst b/docs/sources/source/asr/installation.rst
new file mode 100644
index 000000000000..63de9ae2b4f7
--- /dev/null
+++ b/docs/sources/source/asr/installation.rst
@@ -0,0 +1,30 @@
+Installation
+============
+
+Neural Modules and their corresponding collections have optional requirements that can be installed to
+improve the performance of certain operations.
+
+Torch Audio
+-----------
+
+The `torchaudio` library is used for certain audio pre-processing Neural Modules. Primarily,
+
+ - AudioToMFCCPreprocessor
+ - TimeStretchAugmentation
+
+Official installation directions are provided at the `torchaudio github page <https://github.com/pytorch/audio>`_. It is recommended to follow
+the conda installation procedure and install the latest version of the library available on conda.
+
+Numba
+-----
+
+The `numba` library is used for optimized execution of certain data augmentation procedures that can be used during
+data pre-processing. It can substantially reduce execution time during training, and is a recommended installation for
+Neural Modules.
+
+Official installation directions are provided at the `numba github page <https://github.com/numba/numba>`_.
It is recommended to follow +the conda installation procedure and install the latest version of the library available on conda. + +.. code-block:: bash + + conda install numba diff --git a/docs/sources/source/asr/intro.rst b/docs/sources/source/asr/intro.rst index f50aee692821..f8aac81833e9 100644 --- a/docs/sources/source/asr/intro.rst +++ b/docs/sources/source/asr/intro.rst @@ -6,6 +6,7 @@ Speech Recognition .. toctree:: :maxdepth: 8 + installation tutorial datasets models diff --git a/docs/sources/source/speech_command/installation_link.rst b/docs/sources/source/speech_command/installation_link.rst new file mode 100644 index 000000000000..cef1c53239c9 --- /dev/null +++ b/docs/sources/source/speech_command/installation_link.rst @@ -0,0 +1 @@ +.. include:: ../asr/installation.rst diff --git a/docs/sources/source/speech_command/intro.rst b/docs/sources/source/speech_command/intro.rst index 27a2b52836db..3e597be9a62c 100644 --- a/docs/sources/source/speech_command/intro.rst +++ b/docs/sources/source/speech_command/intro.rst @@ -7,6 +7,7 @@ Speech Commands .. toctree:: :maxdepth: 8 + installation_link tutorial datasets models diff --git a/examples/asr/notebooks/5_Online_Speech_Commands_Microphone_Demo.ipynb b/examples/asr/notebooks/5_Online_Speech_Commands_Microphone_Demo.ipynb new file mode 100644 index 000000000000..2ed61fb942d8 --- /dev/null +++ b/examples/asr/notebooks/5_Online_Speech_Commands_Microphone_Demo.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates speech command recognition from a microphone's stream in NeMo.\n", + "\n", + "It is **not a recommended** way to do inference in production workflows. If you are interested in \n", + "production-level inference using NeMo ASR models, please sign-up to Jarvis early access program: https://developer.nvidia.com/nvidia-jarvis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The notebook requires PyAudio library to get a signal from an audio device.\n", + "For Ubuntu, please run the following commands to install it:\n", + "```\n", + "sudo apt-get install -y portaudio19-dev\n", + "pip install pyaudio\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import nemo\n", + "import nemo.collections.asr as nemo_asr\n", + "import numpy as np\n", + "import pyaudio as pa\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Architecture and Weights\n", + "\n", + "The model architecture is defined in a YAML file available in the config directory. MatchboxNet 3x1x64 has been trained on the Google Speech Commands dataset (v2) version, and these weights are available on NGC. They will automatically be downloaded if not found." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models/nvidia:google_speech_commands_v2___matchboxnet_3x1x1\n", + "MODEL_YAML = '../configs/quartznet_speech_commands_3x1_v2.yaml'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the checkpoint files\n", + "base_checkpoint_path = './checkpoints/matchboxnet_v2-3x1x64/'\n", + "CHECKPOINT_ENCODER = os.path.join(base_checkpoint_path, 'JasperEncoder-STEP-89000.pt')\n", + "CHECKPOINT_DECODER = os.path.join(base_checkpoint_path, 'JasperDecoderForClassification-STEP-89000.pt')\n", + "\n", + "if not os.path.exists(base_checkpoint_path):\n", + " os.makedirs(base_checkpoint_path)\n", + " \n", + "if not os.path.exists(CHECKPOINT_ENCODER):\n", + " !wget https://api.ngc.nvidia.com/v2/models/nvidia/google_speech_commands_v2___matchboxnet_3x1x1/versions/1/files/JasperEncoder-STEP-89000.pt -P {base_checkpoint_path};\n", + "\n", + "if not os.path.exists(CHECKPOINT_DECODER):\n", + " !wget https://api.ngc.nvidia.com/v2/models/nvidia/google_speech_commands_v2___matchboxnet_3x1x1/versions/1/files/JasperDecoderForClassification-STEP-89000.pt -P {base_checkpoint_path};" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Construct the Neural Modules and the eval graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ruamel.yaml import YAML\n", + "yaml = YAML(typ=\"safe\")\n", + "with open(MODEL_YAML) as f:\n", + " model_definition = yaml.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "neural_factory = nemo.core.NeuralModuleFactory(\n", + " placement=nemo.core.DeviceType.GPU,\n", + " backend=nemo.core.Backend.PyTorch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Neural Module to iterate over audio\n", + "\n", + "Here we define a custom Neural Module which acts as an iterator over a stream of audio that is supplied to it. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.backends.pytorch.nm import DataLayerNM\n", + "from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType\n", + "import torch\n", + "\n", + "# simple data layer to pass audio signal\n", + "class AudioDataLayer(DataLayerNM):\n", + " @property\n", + " def output_ports(self):\n", + " return {\n", + " 'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),\n", + " 'a_sig_length': NeuralType(tuple('B'), LengthsType()),\n", + " }\n", + "\n", + " def __init__(self, sample_rate):\n", + " super().__init__()\n", + " self._sample_rate = sample_rate\n", + " self.output = True\n", + " \n", + " def __iter__(self):\n", + " return self\n", + " \n", + " def __next__(self):\n", + " if not self.output:\n", + " raise StopIteration\n", + " self.output = False\n", + " return torch.as_tensor(self.signal, dtype=torch.float32), \\\n", + " torch.as_tensor(self.signal_shape, dtype=torch.int64)\n", + " \n", + " def set_signal(self, signal):\n", + " self.signal = np.reshape(signal.astype(np.float32)/32768., [1, -1])\n", + " self.signal_shape = np.expand_dims(self.signal.size, 0).astype(np.int64)\n", + " self.output = True\n", + "\n", + " def __len__(self):\n", + " return 1\n", + "\n", + " @property\n", + " def dataset(self):\n", + " return None\n", + "\n", + " @property\n", + " def data_iterator(self):\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiate the Neural Modules\n", + "\n", + "We now instantiate the neural modules and the encoder and decoder, set the weights of these models with the downloaded pretrained weights and construct the DAG to evaluate MatchboxNet on audio streams" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate necessary neural modules\n", + "data_layer = AudioDataLayer(sample_rate=model_definition['sample_rate'])\n", + "\n", + "data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(\n", + " **model_definition['AudioToMFCCPreprocessor'])\n", + "\n", + "jasper_encoder = nemo_asr.JasperEncoder(\n", + " **model_definition['JasperEncoder'])\n", + "\n", + "jasper_decoder = nemo_asr.JasperDecoderForClassification(\n", + " feat_in=model_definition['JasperEncoder']['jasper'][-1]['filters'],\n", + " num_classes=len(model_definition['labels']))\n", + "\n", + "# load pre-trained model\n", + "jasper_encoder.restore_from(CHECKPOINT_ENCODER)\n", + "jasper_decoder.restore_from(CHECKPOINT_DECODER)\n", + "\n", + "# Define inference DAG\n", + "audio_signal, audio_signal_len = data_layer()\n", + "processed_signal, processed_signal_len = data_preprocessor(\n", + " input_signal=audio_signal,\n", + " length=audio_signal_len)\n", + "encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,\n", + " length=processed_signal_len)\n", + "log_probs = jasper_decoder(encoder_output=encoded)\n", + "\n", + "# inference method for audio signal (single instance)\n", + "def infer_signal(self, signal):\n", + " data_layer.set_signal(signal)\n", + " tensors = self.infer([log_probs], verbose=False)\n", + " logits = tensors[0][0]\n", + " return logits\n", + "\n", + "neural_factory.infer_signal = infer_signal.__get__(neural_factory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FrameASR: Helper class for streaming inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "# class for streaming frame-based ASR\n", + "# 1) use reset() method to reset FrameASR's state\n", + "# 2) call transcribe(frame) to do ASR on\n", + "# contiguous signal's frames\n", + "class FrameASR:\n", + " \n", + " def __init__(self, neural_factory, model_definition,\n", + " frame_len=2, frame_overlap=2.5, \n", + " offset=10):\n", + " '''\n", + " Args:\n", + " frame_len: frame's duration, seconds\n", + " frame_overlap: duration of overlaps before and after current frame, seconds\n", + " offset: number of symbols to drop for smooth streaming\n", + " '''\n", + " self.vocab = list(model_definition['labels'])\n", + " self.vocab.append('_')\n", + " \n", + " self.sr = model_definition['sample_rate']\n", + " self.frame_len = frame_len\n", + " self.n_frame_len = int(frame_len * self.sr)\n", + " self.frame_overlap = frame_overlap\n", + " self.n_frame_overlap = int(frame_overlap * self.sr)\n", + " timestep_duration = model_definition['AudioToMFCCPreprocessor']['window_stride']\n", + " for block in model_definition['JasperEncoder']['jasper']:\n", + " timestep_duration *= block['stride'][0] ** block['repeat']\n", + " self.buffer = np.zeros(shape=2*self.n_frame_overlap + self.n_frame_len,\n", + " dtype=np.float32)\n", + " self.offset = offset\n", + " self.reset()\n", + " \n", + " def _decode(self, frame, offset=0):\n", + " assert len(frame)==self.n_frame_len\n", + " self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]\n", + " self.buffer[-self.n_frame_len:] = frame\n", + " logits = neural_factory.infer_signal(self.buffer).to('cpu').numpy()[0]\n", + " decoded = self._greedy_decoder(\n", + " logits, \n", + " self.vocab\n", + " )\n", + " return decoded[:len(decoded)-offset]\n", + " \n", + " def transcribe(self, frame=None):\n", + " if frame is None:\n", + " frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)\n", + " if len(frame) < self.n_frame_len:\n", + " frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')\n", + " unmerged = self._decode(frame, self.offset)\n", + " \n", + " return unmerged\n", + " \n", + " def reset(self):\n", + " '''\n", + " Reset frame_history and decoder's state\n", + " '''\n", + " self.buffer=np.zeros(shape=self.buffer.shape, dtype=np.float32)\n", + " self.prev_char = ''\n", + "\n", + " @staticmethod\n", + " def _greedy_decoder(logits, vocab):\n", + " s = ''\n", + " \n", + " if logits.shape[0]:\n", + " s += str(vocab[np.argmax(logits)]) + \"\\n\"\n", + " \n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# duration of signal frame, seconds\n", + "FRAME_LEN = 0.25\n", + "# number of audio channels (expect mono signal)\n", + "CHANNELS = 1\n", + "# sample rate, Hz\n", + "RATE = 16000\n", + "\n", + "CHUNK_SIZE = int(FRAME_LEN*RATE)\n", + "asr = FrameASR(neural_factory, model_definition,\n", + " frame_len=FRAME_LEN, frame_overlap=2.0, \n", + " offset=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What classes can this model recognize?\n", + "\n", + "Before we begin inference on the actual audio stream, lets look at what are the classes this model was trained to recognize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels = model_definition['labels']\n", + "\n", + "for i in range(7):\n", + " for j in range(5):\n", + " print('%-10s' % (labels[i * 5 + j]), end=' ')\n", + " print()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Begin listening to audio stream and perform inference using FrameASR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p = pa.PyAudio()\n", + "print('Available audio input devices:')\n", + "for i in range(p.get_device_count()):\n", + " dev = p.get_device_info_by_index(i)\n", + " if dev.get('maxInputChannels'):\n", + " print(i, dev.get('name'))\n", + "print('Please type input device ID:')\n", + "dev_idx = int(input())\n", + "\n", + "empty_counter = 0\n", + "\n", + "def callback(in_data, frame_count, time_info, status):\n", + " global empty_counter\n", + " signal = np.frombuffer(in_data, dtype=np.int16)\n", + " text = asr.transcribe(signal)\n", + " if len(text):\n", + " print(text,end='')\n", + " empty_counter = 3\n", + " elif empty_counter > 0:\n", + " empty_counter -= 1\n", + " if empty_counter == 0:\n", + " print(' ',end='')\n", + " return (in_data, pa.paContinue)\n", + "\n", + "stream = p.open(format=pa.paInt16,\n", + " channels=CHANNELS,\n", + " rate=RATE,\n", + " input=True,\n", + " input_device_index=dev_idx,\n", + " stream_callback=callback,\n", + " frames_per_buffer=CHUNK_SIZE)\n", + "\n", + "print('Listening...')\n", + "\n", + "stream.start_stream()\n", + "\n", + "while stream.is_active():\n", + " time.sleep(0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stream.stop_stream()\n", + "stream.close()\n", + "p.terminate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py index 2b6139f4ecff..db7ab14968d6 100755 --- a/examples/nlp/question_answering/question_answering_squad.py +++ b/examples/nlp/question_answering/question_answering_squad.py @@ -120,7 +120,10 @@ def parse_args(): ) parser.add_argument("--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.") parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning." + "--bert_checkpoint", default=None, type=str, help="Path to BERT encoder checkpoint for finetuning." + ) + parser.add_argument( + "--head_checkpoint", default=None, type=str, help="Path to BERT QA head checkpoint for finetuning." 
) parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( @@ -200,7 +203,7 @@ def parse_args(): "--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - epoch checkpoint won't be saved", ) parser.add_argument("--train_step_freq", default=100, type=int, help="Frequency of printing training loss") parser.add_argument( @@ -354,6 +357,9 @@ def create_pipeline( if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) + if args.head_checkpoint is not None: + qa_head.restore_from(args.head_checkpoint) + if "train" in args.mode: train_loss, train_steps_per_epoch, _, _ = create_pipeline( data_file=args.train_file, diff --git a/examples/nlp/text_classification/data/import_datasets.py b/examples/nlp/text_classification/data/import_datasets.py index 93137da14ca7..fc5f76e3db11 100644 --- a/examples/nlp/text_classification/data/import_datasets.py +++ b/examples/nlp/text_classification/data/import_datasets.py @@ -15,6 +15,7 @@ # ============================================================================= import argparse +import csv import glob import json import os @@ -43,7 +44,7 @@ def process_imdb(infold, outfold, uncased, modes=['train', 'test']): label = 0 else: label = 1 - files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') + files = glob.glob(f'{infold}/{mode}/{sent}/*.txt') for file in files: with open(file, 'r') as f: review = f.read().strip() @@ -55,6 +56,50 @@ def process_imdb(infold, outfold, uncased, modes=['train', 'test']): outfiles[mode].close() +def process_chemprot(source_dir, target_dir, uncased, modes=['train', 'test', 'dev']): + if not os.path.exists(source_dir): + link = 'https://github.com/arwhirang/recursive_chemprot/tree/master/Demo/tree_LSTM/data' + raise ValueError(f'Data not found at {source_dir}. 
' f'Please download ChemProt from {link}.') + + logging.info(f'Processing Chemprot dataset and store at {target_dir}') + os.makedirs(target_dir, exist_ok=True) + + naming_map = {'train': 'trainingPosit_chem', 'test': 'testPosit_chem', 'dev': 'developPosit_chem'} + + def _read_tsv(input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + outfiles = {} + label_mapping = {} + out_label_mapping = open(os.path.join(target_dir, 'label_mapping.tsv'), 'w') + for mode in modes: + outfiles[mode] = open(os.path.join(target_dir, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + input_file = os.path.join(source_dir, naming_map[mode]) + lines = _read_tsv(input_file) + for line in lines: + text = line[1] + label = line[2] + if label == "True": + label = line[3] + if uncased: + text = text.lower() + if label not in label_mapping: + out_label_mapping.write(f'{label}\t{len(label_mapping)}\n') + label_mapping[label] = len(label_mapping) + label = label_mapping[label] + outfiles[mode].write(f'{text}\t{label}\n') + for mode in modes: + outfiles[mode].close() + out_label_mapping.close() + + def process_thucnews(infold, outfold): modes = ['train', 'test'] train_size = 0.8 @@ -163,7 +208,7 @@ def process_nlu(filename, outfold, uncased, dataset_name, modes=['train', 'test' "--dataset_name", required=True, type=str, - choices=['sst-2', 'imdb', 'thucnews', 'nlu-chat', 'nlu-ubuntu', 'nlu-web'], + choices=['sst-2', 'imdb', 'thucnews', 'nlu-chat', 'nlu-ubuntu', 'nlu-web', 'chemprot'], ) parser.add_argument( "--source_data_dir", required=True, type=str, help='The path to the folder containing the dataset files.' @@ -199,6 +244,8 @@ def process_nlu(filename, outfold, uncased, dataset_name, modes=['train', 'test' elif dataset_name == 'nlu-web': infile = f'{source_dir}/WebApplicationsCorpus.json' process_nlu(filename=infile, outfold=target_dir, uncased=do_lower_case, dataset_name=dataset_name) + elif dataset_name == "chemprot": + process_chemprot(source_dir, target_dir, do_lower_case) else: raise ValueError( f'Dataset {dataset_name} is not supported.' 
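The ChemProt converter added above writes one <mode>.tsv per split (with a "sentence\tlabel" header) plus a label_mapping.tsv. Below is a minimal usage sketch, assuming the pre-split ChemProt files (trainingPosit_chem, testPosit_chem, developPosit_chem) have already been downloaded and that the snippet is run from the script's own data/ directory; all paths are placeholders, not part of the patch.

# Hypothetical invocation of the new ChemProt converter (paths are placeholders).
from import_datasets import process_chemprot

source_dir = '/data/ChemProt'      # must contain trainingPosit_chem, testPosit_chem, developPosit_chem
target_dir = '/data/chemprot_out'  # <mode>.tsv and label_mapping.tsv are written here

# uncased=True lower-cases the sentence text, mirroring the --do_lower_case CLI flag
process_chemprot(source_dir, target_dir, uncased=True)

The same conversion is reachable from the command line via `--dataset_name chemprot --source_data_dir ...` together with the script's output-directory argument.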
diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index f7d8515b69d2..9ce1bceb7cc9 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -37,9 +37,24 @@ ) parser.add_argument("--bert_checkpoint", default=None, type=str) parser.add_argument("--bert_config", default=None, type=str) +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", +) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--tokenizer_model", + default=None, + type=str, + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", +) parser.add_argument("--batch_size", default=32, type=int) parser.add_argument("--max_seq_length", default=36, type=int) parser.add_argument("--num_gpus", default=1, type=int) +parser.add_argument("--num_output_layers", default=1, type=int) parser.add_argument("--num_epochs", default=10, type=int) parser.add_argument("--num_train_samples", default=-1, type=int) parser.add_argument("--num_eval_samples", default=-1, type=int) @@ -54,7 +69,7 @@ "--use_cache", action='store_true', help="When specified loads and stores cache preprocessed data." ) parser.add_argument("--train_file_prefix", default='train', type=str) -parser.add_argument("--eval_file_prefix", default='test', type=str) +parser.add_argument("--eval_file_prefix", default='dev', type=str) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--class_balancing", default="None", type=str, choices=["None", "weighted_loss"]) parser.add_argument( @@ -64,6 +79,7 @@ parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=-1, type=int) parser.add_argument('--loss_step_freq', default=25, type=int, help='Frequency of printing loss') +parser.add_argument('--eval_step_freq', default=100, type=int, help='Frequency of evaluation') parser.add_argument("--local_rank", default=None, type=int) args = parser.parse_args() @@ -84,13 +100,23 @@ hidden_size = model.hidden_size -tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=args.pretrained_model_name) +tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name=args.tokenizer, + pretrained_model_name=args.pretrained_model_name, + tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, +) data_desc = TextClassificationDataDesc(data_dir=args.data_dir, modes=[args.train_file_prefix, args.eval_file_prefix]) # Create sentence classification loss on top classifier = nemo_nlp.nm.trainables.SequenceClassifier( - hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout + hidden_size=hidden_size, + num_classes=data_desc.num_labels, + dropout=args.fc_dropout, + num_layers=args.num_output_layers, + log_softmax=False, ) if args.bert_checkpoint: @@ -108,7 +134,6 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_ logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' shuffle = args.shuffle_data if is_training else False - data_layer = nemo_nlp.nm.data_layers.BertTextClassificationDataLayer( input_file=data_file, tokenizer=tokenizer, @@ -117,7 +142,6 @@ def 
create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_ shuffle=shuffle, batch_size=batch_size, use_cache=args.use_cache, - do_lower_case=args.do_lower_case, ) ids, type_ids, input_mask, labels = data_layer() @@ -174,7 +198,7 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_ user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_layer), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, f'{nf.work_dir}/graphs'), tb_writer=nf.tb_writer, - eval_step=steps_per_epoch, + eval_step=args.eval_step_freq, ) # Create callback to save checkpoints diff --git a/examples/nlp/token_classification/get_medical_data.py b/examples/nlp/token_classification/get_medical_data.py new file mode 100644 index 000000000000..bc213d06f360 --- /dev/null +++ b/examples/nlp/token_classification/get_medical_data.py @@ -0,0 +1,74 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + + +import argparse +import logging +import os +import subprocess + +from nemo import logging + +URL = { + 'bc5cdr': 'https://drive.google.com/uc?id=1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh', + 'ncbi': 'https://drive.google.com/uc?id=1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh', +} + + +def __maybe_download_file(destination: str, dataset: str): + """ + Downloads data from https://github.com/dmis-lab/biobert#datasets named entity recognition to destination if not exists. + If exists, skips download + Args: + destination: local filepath + dataset: dataset + """ + parent_source, child_source = dataset.split("-") + download_url = URL[parent_source] + if not os.path.exists(destination): + logging.info(f'Downloading {download_url} from https://github.com/dmis-lab/biobert#datasets to {destination}') + tmp_zip = '/tmp/data.zip' + tmp_unzip = '/tmp/data' + if not os.path.exists(tmp_unzip): + os.makedirs(tmp_unzip) + else: + subprocess.run(['rm', '-rf', tmp_unzip]) + subprocess.run(['gdown', '-O', tmp_zip, download_url]) + subprocess.run(['unzip', tmp_zip, '-d', tmp_unzip]) + + subprocess.run(['mv', os.path.join(tmp_unzip, f"{parent_source.upper()}-{child_source}"), destination]) + if os.path.exists(os.path.join(destination, "devel.tsv")): + subprocess.run(['mv', os.path.join(destination, "devel.tsv"), os.path.join(destination, "dev.tsv")]) + subprocess.run(['rm', '-rf', tmp_zip]) + subprocess.run(['rm', '-rf', tmp_unzip]) + else: + logging.info(f'{destination} found. 
Skipping download') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Prepare dataset') + parser.add_argument("--data_dir", required=True, type=str) + parser.add_argument( + "--dataset", default='bc5cdr-chem', choices=['bc5cdr-chem', 'bc5cdr-disease', 'ncbi-disease'], type=str + ) + args = parser.parse_args() + + if not os.path.exists(args.data_dir): + os.makedirs(args.data_dir) + + logging.info(f'Downloading dataset') + data_dir = os.path.join(args.data_dir, args.dataset) + __maybe_download_file(data_dir, args.dataset) diff --git a/examples/nlp/token_classification/import_from_iob_format.py b/examples/nlp/token_classification/import_from_iob_format.py index b07f60b125b8..0050e06fb69b 100644 --- a/examples/nlp/token_classification/import_from_iob_format.py +++ b/examples/nlp/token_classification/import_from_iob_format.py @@ -20,7 +20,7 @@ from nemo import logging -def __convert_data(in_file, out_text, out_labels): +def __convert_data(in_file, out_text, out_labels, max_length): """ in_file should be in the IOB format, see example here: https://www.clips.uantwerpen.be/conll2003/ner/. @@ -36,15 +36,59 @@ def __convert_data(in_file, out_text, out_labels): """ in_file = open(in_file, 'r') - with open(out_text, 'w') as text, open(out_labels, 'w') as labels: - for line in in_file: - if line == '\n': - text.write(line) - labels.write(line) - else: - line = line.split() - text.write(line[0] + ' ') - labels.write(line[-1] + ' ') + + if max_length == -1: + with open(out_text, 'w') as out_text, open(out_labels, 'w') as out_labels: + for line in in_file: + if line == '\n': + out_text.write(line) + out_labels.write(line) + else: + line = line.split() + out_text.write(line[0] + ' ') + out_labels.write(line[-1] + ' ') + + else: + lines = [] + words = [] + labels = [] + with open(out_text, 'w') as out_text, open(out_labels, 'w') as out_labels: + lines = in_file.readlines() + for line_id, line in enumerate(lines): + logging.info(f"{line_id} {len(lines)}") + contends = line.strip() + if len(contends) == 0: + assert len(words) == len(labels) + if len(words) > max_length: + # split if the sentence is longer than 30 + while len(words) > max_length: + tmplabel = labels[:max_length] + for iidx in range(len(tmplabel)): + if tmplabel.pop() == 'O': + break + l = ' '.join([label for label in labels[: len(tmplabel) + 1] if len(label) > 0]) + w = ' '.join([word for word in words[: len(tmplabel) + 1] if len(word) > 0]) + # lines.append([l, w]) + out_text.write(w + "\n") + out_labels.write(l + "\n") + words = words[len(tmplabel) + 1 :] + labels = labels[len(tmplabel) + 1 :] + + if len(words) == 0: + continue + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + # lines.append([l, w]) + out_text.write(w + "\n") + out_labels.write(l + "\n") + words = [] + labels = [] + continue + + word = line.strip().split()[0] + label = line.strip().split()[-1] + words.append(word) + labels.append(label) if __name__ == "__main__": @@ -53,23 +97,19 @@ def __convert_data(in_file, out_text, out_labels): + 'format to the format compatible with ' + 'nlp/examples/token_classification.py' ) - parser.add_argument("--data_dir", required=True, type=str) + parser.add_argument("--data_file", required=True, type=str) + parser.add_argument("--max_length", default=-1, type=int) args = parser.parse_args() - for dataset in ['dev.txt', 'train.txt']: - file_path = os.path.join(args.data_dir, dataset) - if not os.path.exists(file_path): - raise 
FileNotFoundError( - "{file_path} not found in {args.data_dir}" - "For NER, CoNLL-2003 dataset" - "can be obtained at" - "https://github.com/kyzhouhzau/BERT" - "-NER/tree/master/data." - ) + data_dir = os.path.dirname(args.data_file) + basename = os.path.basename(args.data_file) + prefix, ext = os.path.splitext(basename) + if not os.path.exists(args.data_file): + raise FileNotFoundError("{data_file} not found in {data_dir}") - logging.info(f'Processing {dataset}') - out_text = os.path.join(args.data_dir, 'text_' + dataset) - out_labels = os.path.join(args.data_dir, 'labels_' + dataset) + logging.info(f'Processing {args.data_file}') + out_text = os.path.join(data_dir, 'text_' + prefix + '.txt') + out_labels = os.path.join(data_dir, 'labels_' + prefix + '.txt') - __convert_data(file_path, out_text, out_labels) - logging.info(f'Processing of the {dataset} is complete') + __convert_data(args.data_file, out_text, out_labels, args.max_length) + logging.info(f'Processing of the {args.data_file} is complete') diff --git a/examples/nlp/token_classification/token_classification.py b/examples/nlp/token_classification/token_classification.py index 298190c94c54..15fd505b252e 100644 --- a/examples/nlp/token_classification/token_classification.py +++ b/examples/nlp/token_classification/token_classification.py @@ -51,8 +51,10 @@ parser.add_argument("--ignore_start_end", action='store_false') parser.add_argument("--ignore_extra_tokens", action='store_false') parser.add_argument("--none_label", default='O', type=str) +parser.add_argument("--mode", default='train_eval', choices=["train_eval", "train"], type=str) parser.add_argument("--no_shuffle_data", action='store_false', dest="shuffle_data") parser.add_argument("--no_time_to_log_dir", action="store_true", help="whether to add time to work_dir or not") +parser.add_argument("--batches_per_step", default=1, type=int, help="Number of iterations per step.") parser.add_argument( "--pretrained_model_name", default="bert-base-uncased", @@ -75,6 +77,8 @@ choices=["nemobert", "sentencepiece"], help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument("--do_lower_case", action='store_true') parser.add_argument( "--work_dir", default='output', @@ -95,6 +99,7 @@ help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") +parser.add_argument("--eval_step_freq", default=100, type=int, help="Frequency of evaluation") parser.add_argument("--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss") args = parser.parse_args() @@ -129,6 +134,8 @@ tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) if args.bert_checkpoint is not None: @@ -144,6 +151,7 @@ def create_pipeline( batch_size=args.batch_size, num_gpus=args.num_gpus, mode='train', + batches_per_step=args.batches_per_step, label_ids=None, ignore_extra_tokens=args.ignore_extra_tokens, ignore_start_end=args.ignore_start_end, @@ -208,7 +216,7 @@ def create_pipeline( if mode == 'train': loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) - steps_per_epoch = len(data_layer) // (batch_size * num_gpus) + steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step) 
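# Illustrative arithmetic for the change above (example numbers, not from the patch): with
# gradient accumulation enabled via --batches_per_step, one optimizer step consumes
# `batches_per_step` micro-batches, so a 100,000-example dataset with batch_size=32,
# num_gpus=1 and batches_per_step=4 yields steps_per_epoch = 100000 // (32 * 1 * 4) = 781
# optimizer steps per epoch, instead of 3125 when batches_per_step=1.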
tensors_to_evaluate = [loss, logits] return tensors_to_evaluate, loss, steps_per_epoch, label_ids, classifier else: @@ -216,12 +224,9 @@ def create_pipeline( return tensors_to_evaluate, data_layer +callbacks = [] train_tensors, train_loss, steps_per_epoch, label_ids, classifier = create_pipeline() - -eval_tensors, data_layer = create_pipeline(mode='dev', label_ids=label_ids, classifier=classifier) - logging.info(f"steps_per_epoch = {steps_per_epoch}") - # Create trainer and execute training action train_callback = nemo.core.SimpleLossLoggerCallback( tensors=train_tensors, @@ -229,18 +234,24 @@ def create_pipeline( get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, ) +callbacks.append(train_callback) -eval_callback = nemo.core.EvaluatorCallback( - eval_tensors=eval_tensors, - user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids, f'{nf.work_dir}/graphs'), - tb_writer=nf.tb_writer, - eval_step=steps_per_epoch, -) + +if "eval" in args.mode: + eval_tensors, data_layer = create_pipeline(mode='dev', label_ids=label_ids, classifier=classifier) + eval_callback = nemo.core.EvaluatorCallback( + eval_tensors=eval_tensors, + user_iter_callback=lambda x, y: eval_iter_callback(x, y), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids, f'{nf.work_dir}/graphs'), + tb_writer=nf.tb_writer, + eval_step=args.eval_step_freq, + ) + callbacks.append(eval_callback) ckpt_callback = nemo.core.CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) +callbacks.append(ckpt_callback) lr_policy_fn = get_lr_policy( args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion @@ -248,8 +259,9 @@ def create_pipeline( nf.train( tensors_to_optimize=[train_loss], - callbacks=[train_callback, eval_callback, ckpt_callback], + callbacks=callbacks, lr_policy=lr_policy_fn, + batches_per_step=args.batches_per_step, optimizer=args.optimizer_kind, optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 05458bf0f4b5..e1323f90c70b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -971,12 +971,6 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defa # Make a deep copy of init parameters. init_params_copy = copy.deepcopy(module._init_params) - # Remove NeMo-related things from the module - # We need to change __call__ method. Note that this will change the - # whole class, not just this object! Which is why we need to repair it - # in the finally block - type(module).__call__ = torch.nn.Module.__call__ - # Reset standard instance field - making the file (probably) lighter. module._init_params = None module._placement = None @@ -985,6 +979,13 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defa module.eval() try: + # # Remove NeMo-related things from the module + # # We need to change __call__ method. Note that this will change the + # # whole class, not just this object! 
Which is why we need to repair it + # # in the finally block + __orig_call__ = type(module).__call__ + type(module).__call__ = torch.nn.Module.__call__ + if d_format == DeploymentFormat.TORCHSCRIPT: if input_example is None: # Route 1 - via torch.jit.script @@ -1036,15 +1037,7 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defa except Exception as e: # nopep8 logging.error(f'module export failed for {module} ' f'with exception {e}') finally: - - def __old_call__(self, force_pt=False, *input, **kwargs): - pt_call = len(input) > 0 or force_pt - if pt_call: - return nn.Module.__call__(self, *input, **kwargs) - else: - return NeuralModule.__call__(self, **kwargs) - - type(module).__call__ = __old_call__ + type(module).__call__ = __orig_call__ @staticmethod def deployment_export(module, output: str, d_format: DeploymentFormat, input_example=None, output_example=None): diff --git a/nemo/collections/asr/audio_preprocessing.py b/nemo/collections/asr/audio_preprocessing.py index 34914957d8e0..98c96b8520d9 100644 --- a/nemo/collections/asr/audio_preprocessing.py +++ b/nemo/collections/asr/audio_preprocessing.py @@ -24,14 +24,18 @@ 'CropOrPadSpectrogramAugmentation', 'MultiplyBatch', 'SpectrogramAugmentation', + 'TimeStretchAugmentation', ] import math import warnings from abc import abstractmethod +import numpy as np import torch +from packaging import version +import nemo from .parts.features import FilterbankFeatures from .parts.spectr_augment import SpecAugment, SpecCutout from nemo.backends.pytorch import NonTrainableNM @@ -41,6 +45,11 @@ try: import torchaudio + import torchaudio.transforms + import torchaudio.functional + + TORCHAUDIO_VERSION = version.parse(torchaudio.__version__) + TORCHAUDIO_VERSION_MIN = version.parse('0.5') HAVE_TORCHAUDIO = True except ModuleNotFoundError: @@ -52,6 +61,9 @@ warnings.warn("Unable to import APEX. Mixed precision and distributed training will not work.") +logging = nemo.logging + + class AudioPreprocessor(NonTrainableNM): """ A base class for Neural Modules that performs audio preprocessing, @@ -738,6 +750,178 @@ def output_ports(self): } +class TimeStretchAugmentation(NonTrainableNM): + def __init__( + self, + sample_rate: int, + probability: float, + min_speed_rate: float = 0.9, + max_speed_rate: float = 1.1, + num_rates: int = 5, + n_fft: int = 512, + ): + """ + Time-stretch a batch of audio series by a fixed rate while preserving pitch. + + Note that while the speed rate is sampled independently for every batch, + all samples of that batch will be augmented by the same speed rate. + + Note: + This is a simplified implementation, intended primarily for reference and pedagogical purposes. + It makes no attempt to handle transients, and is likely to produce audible artifacts. + + Args: + sample_rate: Sampling rate. + probability: Float value declaring chance of the input being augmented. + Must be a float value in the range [0, 1]. + min_speed_rate: Minimum sampling rate modifier. + max_speed_rate: Maximum sampling rate modifier. + num_rates: Number of discrete rates to allow. Can be a positive or negative + integer. + If a positive integer greater than 0 is provided, the range of + speed rates will be discretized into `num_rates` values. + If a negative integer or 0 is provided, the full range of speed rates + will be sampled uniformly. 
+ Note: If a positive integer is provided and the resultant discretized + range of rates contains the value '1.0', then those samples with rate=1.0, + will not be augmented at all and simply skipped. This is to avoid unnecessary + augmentation and increase computation time. Effective augmentation chance + in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance + where `prob` is the global probability of a sample being augmented. + n_fft: Number of fft filters to be computed. + """ + super(TimeStretchAugmentation, self).__init__() + + if probability > 1.0 or probability < 0.0: + raise ValueError("`probability` must be between 0 and 1") + + if not HAVE_TORCHAUDIO: + raise ModuleNotFoundError( + "torchaudio is not installed but is necessary for " + "TimeStretchAugmentation. We recommend you try " + "installing it from conda for the PyTorch version you have." + ) + + # Check torchaudio version; inform user of potential issue + if TORCHAUDIO_VERSION < TORCHAUDIO_VERSION_MIN: + logging.error( + "Current installed version of `torchaudio` %s is less than the recommended minimum " + "version of %s. Please note that this may cause deadlocks when using distributed " + "data parallel training. Please follow the instructions at https://github.com/pytorch/audio " + "to update torchaudio.", + str(TORCHAUDIO_VERSION), + str(TORCHAUDIO_VERSION_MIN), + ) + + min_rate = min(min_speed_rate, max_speed_rate) + if min_rate < 0.0: + raise ValueError("Minimum sampling rate modifier must be > 0.") + + self._sample_rate = sample_rate + self.probability = float(probability) + self.min_rate = float(min_speed_rate) + self.max_rate = float(max_speed_rate) + self.num_rates = num_rates + if num_rates > 0: + self._rates = np.linspace(min_speed_rate, max_speed_rate, num_rates) + self._rng = np.random.RandomState() + + self._n_fft = n_fft + self._hop_length = n_fft // 2 + self._stft_window = torch.hann_window(self._n_fft, periodic=True, device=self._device) + self._phi_advance = torch.linspace(0, np.pi * self._hop_length, self._hop_length + 1, device=self._device) + self._phi_advance = self._phi_advance.view(-1, 1) + + @torch.no_grad() + def forward(self, input_signal, length): + proba = self._rng.uniform(0.0, 1.0) + + if proba > self.probability: + return input_signal, length + + # Select speed rate either from choice or random sample + if self.num_rates < 0: + speed_rate = self._rng.uniform(self.min_rate, self.max_rate) + else: + speed_rate = np.random.choice(self._rates) + + # Skip perturbation in case of identity speed rate + if speed_rate == 1.0: + return input_signal, length + + features = self._stft(input_signal, self._n_fft, self._hop_length) + features = self._phase_vocoder(features, speed_rate) + + # Predict the length of y_stretch + len_stretch = int(round(input_signal.shape[1] / speed_rate)) + + audio = self._istft(features, len_stretch) + + length = (length * speed_rate).type(torch.long) + + return audio, length + + def _stft(self, data: torch.Tensor, n_fft: int, hop_length: int): + win_length = n_fft + window = self._stft_window + + stft = torch.stft( + data, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode='reflect', + normalized=False, + ) + return stft + + def _phase_vocoder(self, data: torch.Tensor, rate: float): + data_stretch = torchaudio.functional.phase_vocoder(data, rate, self._phi_advance) + return data_stretch + + def _istft(self, data: torch.Tensor, len_stretch: int): + n_fft = 2 * (data.shape[1] - 1) + hop_length = 
self._hop_length + win_length = n_fft + window = self._stft_window + + audio = torchaudio.functional.istft( + data, + n_fft, + hop_length, + win_length, + window=window, + center=True, + pad_mode='reflect', + normalized=False, + length=len_stretch, + ) + + return audio + + @property + @add_port_docs() + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + "input_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), + "length": NeuralType(tuple('B'), LengthsType()), + } + + @property + @add_port_docs() + def output_ports(self): + """Returns definitions of module output ports. + """ + return { + "processed_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), + "processed_length": NeuralType(tuple('B'), LengthsType()), + } + + def AudioPreprocessing(*args, **kwargs): raise NotImplementedError( "AudioPreprocessing has been deprecated and replaced by: " diff --git a/nemo/collections/asr/helpers.py b/nemo/collections/asr/helpers.py index 8d2608276f06..a2c7d0210470 100644 --- a/nemo/collections/asr/helpers.py +++ b/nemo/collections/asr/helpers.py @@ -257,12 +257,12 @@ def process_classification_evaluation_epoch(global_vars: dict, eval_metric=None, eloss = torch.mean(torch.stack(global_vars['EvalLoss'])).item() batch_sizes = global_vars['batchsize'] - total_num_samples = torch.tensor(batch_sizes).sum().float() + total_num_samples = torch.tensor(batch_sizes).sum().double() topk_accs = [] for k in top_k: correct_counts = torch.tensor(global_vars[f'CorrectCount@{k}']) - topk_acc = correct_counts.sum() / total_num_samples + topk_acc = correct_counts.sum().double() / total_num_samples topk_accs.append(topk_acc) if tag is None: diff --git a/nemo/collections/asr/parts/numba_utils.py b/nemo/collections/asr/parts/numba_utils.py new file mode 100644 index 000000000000..f7685e0b4ea9 --- /dev/null +++ b/nemo/collections/asr/parts/numba_utils.py @@ -0,0 +1,93 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from numba import jit + + +def phase_vocoder(D: np.ndarray, rate: float, phi_advance: np.ndarray, scale_buffer: np.ndarray): + """ + Optimized implementation of phase vocoder from Librosa. + + Reference implementation: + - https://librosa.github.io/librosa/generated/librosa.core.phase_vocoder.html + + Args: + D: Complex spectograms of shape [d, t, complex=2]. + rate: Speed rate, must be float greater than 0. 
+ phi_advance: Precomputed phase advance buffer array of length [n_fft + 1] + scale_buffer: Precomputed numpy buffer array of length [n_fft + 1] + + Returns: + Complex64 ndarray of shape [d, t / rate, complex=2] + """ + time_steps = np.arange(0, D.shape[1], rate, dtype=np.float) + + # Create an empty output array + d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F') + + # Phase accumulator; initialize to the first sample + phase_acc = np.angle(D[:, 0]) + + # Pad 0 columns to simplify boundary logic + D = np.pad(D, [(0, 0), (0, 2)], mode='constant') + + d_stretch = _phase_vocoder_kernel(D, time_steps, phi_advance, d_stretch, phase_acc, scale_buffer) + + return d_stretch + + +@jit(nopython=True, nogil=True) +def _phase_vocoder_kernel(D, time_steps, phi_advance, d_stretch, phase_acc, scale_buffer): + """ + Numba optimized kernel to compute the phase vocoder step. + + Args: + D: Complex spectograms of shape [d, t, complex=2]. + rate: Speed rate, must be float greater than 0. + time_steps: Numpy ndarray of linearly spaced time steps, shape = [t] + phi_advance: Precomputed phase advance buffer array of length [n_fft + 1] + d_stretch: Output complex matrix of shape [d, t / rate, complex=2] + phase_acc: Phase accumulator initialized to first sample of shape [d, complex=2] + scale_buffer: Precomputed numpy buffer array of length [n_fft + 1] + + Returns: + Complex64 ndarray of shape [d, t / rate, complex=2] + """ + two_pi = 2.0 * np.pi + + for (t, step) in enumerate(time_steps): + columns = D[:, int(step) : int(step + 2)] + columns_0 = columns[:, 0] + columns_1 = columns[:, 1] + + # Weighting for linear magnitude interpolation + alpha = np.mod(step, 1.0) + mag = (1.0 - alpha) * np.abs(columns_0) + alpha * np.abs(columns_1) + + # Store to output array + d_stretch[:, t] = mag * np.exp(1.0j * phase_acc) + + # Compute phase advance + dphase = np.angle(columns_1) - np.angle(columns_0) - phi_advance + + # Wrap to -pi:pi range + scale = dphase / two_pi + np.round(scale, 0, scale_buffer) + + dphase = dphase - two_pi * scale_buffer + + # Accumulate phase + phase_acc += phi_advance + dphase + + return d_stretch diff --git a/nemo/collections/asr/parts/perturb.py b/nemo/collections/asr/parts/perturb.py index 44cdbeb15a33..f52635fbec06 100644 --- a/nemo/collections/asr/parts/perturb.py +++ b/nemo/collections/asr/parts/perturb.py @@ -10,6 +10,13 @@ from nemo.collections.asr.parts import collections, parsers from nemo.collections.asr.parts.segment import AudioSegment +try: + from nemo.collections.asr.parts import numba_utils + + HAVE_NUMBA = True +except (ImportError, ModuleNotFoundError): + HAVE_NUMBA = False + class Perturbation(object): def max_augmentation_length(self, length): @@ -20,20 +27,174 @@ def perturb(self, data): class SpeedPerturbation(Perturbation): - def __init__(self, min_speed_rate=0.85, max_speed_rate=1.15, rng=None): + def __init__(self, sr, resample_type, min_speed_rate=0.9, max_speed_rate=1.1, num_rates=5, rng=None): + """ + Performs Speed Augmentation by re-sampling the data to a different sampling rate, + which does not preserve pitch. + + Note: This is a very slow operation for online augmentation. If space allows, + it is preferable to pre-compute and save the files to augment the dataset. + + Args: + sr: Original sampling rate. + resample_type: Type of resampling operation that will be performed. + For better speed using `resampy`'s fast resampling method, use `resample_type='kaiser_fast'`. + For high-quality resampling, set `resample_type='kaiser_best'`. 
+ To use `scipy.signal.resample`, set `resample_type='fft'` or `resample_type='scipy'` + min_speed_rate: Minimum sampling rate modifier. + max_speed_rate: Maximum sampling rate modifier. + num_rates: Number of discrete rates to allow. Can be a positive or negative + integer. + If a positive integer greater than 0 is provided, the range of + speed rates will be discretized into `num_rates` values. + If a negative integer or 0 is provided, the full range of speed rates + will be sampled uniformly. + Note: If a positive integer is provided and the resultant discretized + range of rates contains the value '1.0', then those samples with rate=1.0, + will not be augmented at all and simply skipped. This is to unnecessary + augmentation and increase computation time. Effective augmentation chance + in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance + where `prob` is the global probability of a sample being augmented. + rng: Random seed number. + """ + min_rate = min(min_speed_rate, max_speed_rate) + if min_rate < 0.0: + raise ValueError("Minimum sampling rate modifier must be > 0.") + + if resample_type not in ('kaiser_best', 'kaiser_fast', 'fft', 'scipy'): + raise ValueError("Supported `resample_type` values are ('kaiser_best', 'kaiser_fast', 'fft', 'scipy')") + + self._sr = sr + self._min_rate = min_speed_rate + self._max_rate = max_speed_rate + self._num_rates = num_rates + if num_rates > 0: + self._rates = np.linspace(self._min_rate, self._max_rate, self._num_rates, endpoint=True) + self._res_type = resample_type + self._rng = random.Random() if rng is None else rng + + def max_augmentation_length(self, length): + return length * self._max_rate + + def perturb(self, data): + # Select speed rate either from choice or random sample + if self._num_rates < 0: + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + else: + speed_rate = self._rng.choice(self._rates) + + # Skip perturbation in case of identity speed rate + if speed_rate == 1.0: + return + + new_sr = int(self._sr * speed_rate) + data._samples = librosa.core.resample(data._samples, self._sr, new_sr, res_type=self._res_type) + + +class TimeStretchPerturbation(Perturbation): + def __init__(self, min_speed_rate=0.9, max_speed_rate=1.1, num_rates=5, n_fft=512, rng=None): + """ + Time-stretch an audio series by a fixed rate while preserving pitch, based on [1, 2]. + + Note: + This is a simplified implementation, intended primarily for reference and pedagogical purposes. + It makes no attempt to handle transients, and is likely to produce audible artifacts. + + Reference + [1] [Ellis, D. P. W. “A phase vocoder in Matlab.” Columbia University, 2002.] + (http://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/) + [2] [librosa.effects.time_stretch] + (https://librosa.github.io/librosa/generated/librosa.effects.time_stretch.html) + + Args: + min_speed_rate: Minimum sampling rate modifier. + max_speed_rate: Maximum sampling rate modifier. + num_rates: Number of discrete rates to allow. Can be a positive or negative + integer. + If a positive integer greater than 0 is provided, the range of + speed rates will be discretized into `num_rates` values. + If a negative integer or 0 is provided, the full range of speed rates + will be sampled uniformly. + Note: If a positive integer is provided and the resultant discretized + range of rates contains the value '1.0', then those samples with rate=1.0, + will not be augmented at all and simply skipped. This is to avoid unnecessary + augmentation and increase computation time. 
Effective augmentation chance + in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance + where `prob` is the global probability of a sample being augmented. + n_fft: Number of fft filters to be computed. + rng: Random seed number. + """ + min_rate = min(min_speed_rate, max_speed_rate) + if min_rate < 0.0: + raise ValueError("Minimum sampling rate modifier must be > 0.") + self._min_rate = min_speed_rate self._max_rate = max_speed_rate + self._num_rates = num_rates + if num_rates > 0: + self._rates = np.linspace(self._min_rate, self._max_rate, self._num_rates, endpoint=True) self._rng = random.Random() if rng is None else rng + # Pre-compute constants + self._n_fft = int(n_fft) + self._hop_length = int(n_fft // 2) + + # Pre-allocate buffers + self._phi_advance_fast = np.linspace(0, np.pi * self._hop_length, self._hop_length + 1) + self._scale_buffer_fast = np.empty(self._hop_length + 1, dtype=np.float32) + + self._phi_advance_slow = np.linspace(0, np.pi * self._n_fft, self._n_fft + 1) + self._scale_buffer_slow = np.empty(self._n_fft + 1, dtype=np.float32) + def max_augmentation_length(self, length): return length * self._max_rate def perturb(self, data): - speed_rate = self._rng.uniform(self._min_rate, self._max_rate) - if speed_rate <= 0: - raise ValueError("speed_rate should be greater than zero.") - # logging.debug("speed: %f", speed_rate) - data._samples = librosa.effects.time_stretch(data._samples, speed_rate) + # Select speed rate either from choice or random sample + if self._num_rates < 0: + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + else: + speed_rate = self._rng.choice(self._rates) + + # Skip perturbation in case of identity speed rate + if speed_rate == 1.0: + return + + # Increase `n_fft` based on task (speed up or slow down audio) + # This greatly reduces upper bound of maximum time taken + # to compute slowed down audio segments. 
+ if speed_rate >= 1.0: # Speed up audio + fft_multiplier = 1 + phi_advance = self._phi_advance_fast + scale_buffer = self._scale_buffer_fast + + else: # Slow down audio + fft_multiplier = 2 + phi_advance = self._phi_advance_slow + scale_buffer = self._scale_buffer_slow + + n_fft = int(self._n_fft * fft_multiplier) + hop_length = int(self._hop_length * fft_multiplier) + + # Perform short-term Fourier transform (STFT) + stft = librosa.core.stft(data._samples, n_fft=n_fft, hop_length=hop_length) + + # Stretch by phase vocoding + if HAVE_NUMBA: + stft_stretch = numba_utils.phase_vocoder(stft, speed_rate, phi_advance, scale_buffer) + + else: + stft_stretch = librosa.core.phase_vocoder(stft, speed_rate, hop_length) + + # Predict the length of y_stretch + len_stretch = int(round(len(data._samples) / speed_rate)) + + # Invert the STFT + y_stretch = librosa.core.istft( + stft_stretch, dtype=data._samples.dtype, hop_length=hop_length, length=len_stretch + ) + + data._samples = y_stretch class GainPerturbation(Perturbation): @@ -129,6 +290,7 @@ def perturb(self, data): perturbation_types = { "speed": SpeedPerturbation, + "time_stretch": TimeStretchPerturbation, "gain": GainPerturbation, "impulse": ImpulsePerturbation, "shift": ShiftPerturbation, diff --git a/nemo/collections/nlp/callbacks/text_classification_callback.py b/nemo/collections/nlp/callbacks/text_classification_callback.py index 5a1e313fdb76..bd62e6d89237 100644 --- a/nemo/collections/nlp/callbacks/text_classification_callback.py +++ b/nemo/collections/nlp/callbacks/text_classification_callback.py @@ -64,5 +64,5 @@ def eval_epochs_done_callback(global_vars, graph_fold): logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) plot_confusion_matrix(labels, preds, graph_fold) - logging.info(classification_report(labels, preds)) + logging.info(classification_report(labels, preds, digits=4)) return dict({"accuracy": accuracy}) diff --git a/nemo/collections/nlp/data/datasets/text_classification/text_classification_dataset.py b/nemo/collections/nlp/data/datasets/text_classification/text_classification_dataset.py index 35bde1987ca3..21af59b5ec60 100644 --- a/nemo/collections/nlp/data/datasets/text_classification/text_classification_dataset.py +++ b/nemo/collections/nlp/data/datasets/text_classification/text_classification_dataset.py @@ -49,14 +49,7 @@ class BertTextClassificationDataset(Dataset): """ def __init__( - self, - input_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=False, - use_cache=False, - do_lower_case=False, + self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=False, use_cache=False, ): self.input_file = input_file self.max_seq_length = max_seq_length @@ -65,7 +58,6 @@ def __init__( self.use_cache = use_cache self.shuffle = shuffle self.vocab_size = self.tokenizer.tokenizer.vocab_size - self.do_lower_case = do_lower_case if use_cache: data_dir, filename = os.path.split(input_file) @@ -99,9 +91,6 @@ def __init__( if index % 20000 == 0: logging.debug(f"Processing line {index}/{len(lines)}") - if do_lower_case: - line = line.lower() - line_splited = line.strip().split() sent_label = int(line_splited[-1]) sent_labels.append(sent_label) diff --git a/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py b/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py index 77aa0f1abf44..836e748c3370 100644 --- a/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py +++ 
b/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py @@ -61,7 +61,14 @@ def get_bert_special_tokens(bert_derivative): return MODEL_SPECIAL_TOKENS[bert_derivative] -def get_tokenizer(tokenizer_name, pretrained_model_name, tokenizer_model=None, special_tokens=None): +def get_tokenizer( + tokenizer_name, + pretrained_model_name, + tokenizer_model=None, + special_tokens=None, + vocab_file=None, + do_lower_case=False, +): ''' Args: tokenizer_name: sentencepiece or nemobert @@ -70,10 +77,14 @@ def get_tokenizer(tokenizer_name, pretrained_model_name, tokenizer_model=None, s To see the list of pretrained models, use: nemo_nlp.nm.trainables.get_bert_models_list() tokenizer_model (path): only used for sentencepiece tokenizer special_tokens (dict): dict of special tokens (Optional) + vocab_file (str): path to vocab file + do_lower_case (bool): whether to lower case the input text; only applicable when the tokenizer is built from a vocab file ''' if tokenizer_name == 'nemobert': - tokenizer = nemo.collections.nlp.data.tokenizers.NemoBertTokenizer(pretrained_model=pretrained_model_name) + tokenizer = nemo.collections.nlp.data.tokenizers.NemoBertTokenizer( + pretrained_model=pretrained_model_name, vocab_file=vocab_file, do_lower_case=do_lower_case + ) elif tokenizer_name == 'sentencepiece': if not os.path.exists(tokenizer_model): raise FileNotFoundError(f'{tokenizer_model} tokenizer model not found') diff --git a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py index 006dfeabc537..4aceabd54f60 100644 --- a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py @@ -38,7 +38,6 @@ class BertTextClassificationDataLayer(TextDataLayer): num_samples (int): number of samples to load. default is -1 which means all samples. shuffle (bool): whether to shuffle data or not. Default: False.
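Returning briefly to the `get_tokenizer` change above: a hedged sketch of how the new `vocab_file` and `do_lower_case` arguments would be passed. The pretrained model name and vocab path are placeholders, and the import path simply mirrors the module touched by this diff.

```python
from nemo.collections.nlp.data.tokenizers.tokenizer_utils import get_tokenizer

# Placeholder model name and vocab path; illustrative only.
tokenizer = get_tokenizer(
    tokenizer_name='nemobert',
    pretrained_model_name='bert-base-uncased',
    vocab_file='vocab.txt',
    do_lower_case=True,
)
```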
batch_size: text segments batch size - do_lower_case (bool): whether to make the sentence all lower case dataset (BertTextClassificationDataset): the dataset that needs to be converted to DataLayerNM """ @@ -71,7 +70,6 @@ def __init__( shuffle=False, batch_size=64, use_cache=False, - do_lower_case=False, dataset_type=BertTextClassificationDataset, ): dataset_params = { @@ -81,6 +79,5 @@ def __init__( 'num_samples': num_samples, 'use_cache': use_cache, 'shuffle': shuffle, - 'do_lower_case': do_lower_case, } super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/utils/callback_utils.py b/nemo/collections/nlp/utils/callback_utils.py index 04329eb02d0e..a6dc86c1e6f9 100644 --- a/nemo/collections/nlp/utils/callback_utils.py +++ b/nemo/collections/nlp/utils/callback_utils.py @@ -113,7 +113,7 @@ def get_classification_report(labels, preds, label_ids): if v in used_labels ] - return classification_report(labels, preds, target_names=labels_names) + return classification_report(labels, preds, target_names=labels_names, digits=4) def get_f1_scores(labels, preds, average_modes=['binary', 'weighted', 'macro', 'micro']): diff --git a/nemo/collections/tts/parts/waveglow.py b/nemo/collections/tts/parts/waveglow.py index 8fc011dd296e..0f6d74e05501 100644 --- a/nemo/collections/tts/parts/waveglow.py +++ b/nemo/collections/tts/parts/waveglow.py @@ -1,7 +1,8 @@ # Copyright (c) 2019 NVIDIA Corporation +from typing import Tuple + import torch import torch.nn.functional as F -from torch.autograd import Variable @torch.jit.script @@ -34,7 +35,7 @@ def __init__(self, c): W = W.view(c, c, 1) self.conv.weight.data = W - def forward(self, z, reverse=False): + def forward(self, z, reverse: bool = False): # shape batch_size, group_size, n_of_groups = z.size() @@ -44,17 +45,23 @@ def forward(self, z, reverse=False): if not hasattr(self, 'W_inverse'): # Reverse computation W_inverse = W.float().inverse() - W_inverse = Variable(W_inverse[..., None]) - if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor': + if z.dtype == torch.half: W_inverse = W_inverse.half() - self.W_inverse = W_inverse - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) - return z + z = F.conv1d(z, W_inverse, bias=None, stride=1, padding=0) + # Tracer demands uniform output, i.e two tensors: + dummy = torch.zeros([1]) + return ( + z, + dummy, + ) else: # Forward computation log_det_W = batch_size * n_of_groups * torch.logdet(W.float()) z = self.conv(z) - return z, log_det_W + return ( + z, + log_det_W, + ) class WN(torch.nn.Module): @@ -105,8 +112,8 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_s res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') self.res_skip_layers.append(res_skip_layer) - def forward(self, forward_input): - audio, spect = forward_input + def forward(self, forward_input: Tuple[torch.Tensor, torch.Tensor]): + audio, spect = forward_input[0], forward_input[1] audio = self.start(audio) for i in range(self.n_layers): @@ -156,12 +163,12 @@ def __init__( self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) self.n_remaining_channels = n_remaining_channels - def forward(self, forward_input): + def forward(self, forward_input: Tuple[torch.Tensor, torch.Tensor]): """ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames forward_input[1] = audio: batch x time """ - spect, audio = forward_input + spect, audio = forward_input[0], forward_input[1] # Upsample spectrogram to 
size of audio spect = self.upsample(spect) @@ -178,30 +185,33 @@ def forward(self, forward_input): log_s_list = [] log_det_W_list = [] - for k in range(self.n_flows): + k = 0 + for convinvk, wnk in zip(self.convinv, self.WN): if k % self.n_early_every == 0 and k > 0: output_audio.append(audio[:, : self.n_early_size, :]) audio = audio[:, self.n_early_size :, :] - audio, log_det_W = self.convinv[k](audio) + audio, log_det_W = convinvk(audio) log_det_W_list.append(log_det_W) n_half = int(audio.size(1) / 2) audio_0 = audio[:, :n_half, :] audio_1 = audio[:, n_half:, :] - output = self.WN[k]((audio_0, spect)) + output = wnk((audio_0, spect)) log_s = output[:, n_half:, :] b = output[:, :n_half, :] audio_1 = torch.exp(log_s) * audio_1 + b log_s_list.append(log_s) - audio = torch.cat([audio_0, audio_1], 1) + audio = torch.cat((audio_0, audio_1), 1) + k += 1 output_audio.append(audio) return torch.cat(output_audio, 1), log_s_list, log_det_W_list - def infer(self, spect, sigma=1.0): + @torch.jit.ignore + def infer(self, spect, sigma: float = 1.0): spect = self.upsample(spect) # trim conv artifacts. maybe pad spec to kernel multiple time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] @@ -211,12 +221,13 @@ def infer(self, spect, sigma=1.0): spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) spect = spect.permute(0, 2, 1) - audio = torch.randn(spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device,).to( + audio = sigma * torch.randn(spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device).to( spect.dtype ) - audio = torch.autograd.Variable(sigma * audio) - + # k = int(self.n_flows - 1) + # TODO - when ModuleList will support reversed iterator, make it traceable + # for convinvk, wnk in zip(self.convinv, self.WN): for k in reversed(range(self.n_flows)): n_half = int(audio.size(1) / 2) audio_0 = audio[:, :n_half, :] @@ -226,26 +237,30 @@ def infer(self, spect, sigma=1.0): s = output[:, n_half:, :] b = output[:, :n_half, :] audio_1 = (audio_1 - b) / torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) + audio = torch.cat((audio_0, audio_1), 1) audio = self.convinv[k](audio, reverse=True) + audio = audio[0] if k % self.n_early_every == 0 and k > 0: - z = torch.randn(spect.size(0), self.n_early_size, spect.size(2), device=spect.device,).to(spect.dtype) - audio = torch.cat((sigma * z, audio), 1) + z = sigma * torch.randn(spect.size(0), self.n_early_size, spect.size(2), device=spect.device).to( + spect.dtype + ) + audio = torch.cat((z, audio), 1) + # k -= 1 audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data return audio - @staticmethod - def remove_weightnorm(model): - waveglow = model - for WN in waveglow.WN: - WN.start = torch.nn.utils.remove_weight_norm(WN.start) - WN.in_layers = remove(WN.in_layers) - WN.cond_layers = remove(WN.cond_layers) - WN.res_skip_layers = remove(WN.res_skip_layers) - return waveglow + +def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow def remove(conv_list): diff --git a/nemo/collections/tts/waveglow_modules.py b/nemo/collections/tts/waveglow_modules.py index 6d99381dc0cf..aa648d3aebaf 100644 --- a/nemo/collections/tts/waveglow_modules.py +++ b/nemo/collections/tts/waveglow_modules.py @@ -5,7 +5,7 @@ from nemo import logging from nemo.backends.pytorch.nm 
import LossNM, TrainableNM -from nemo.collections.tts.parts.waveglow import WaveGlow +from nemo.collections.tts.parts.waveglow import WaveGlow, remove_weightnorm from nemo.core.neural_types import * from nemo.utils.decorators import add_port_docs @@ -207,16 +207,15 @@ def denoise(self, audio, strength=0.1): audio_denoised = librosa.core.istft(audio_spec_denoised * audio_angles) return audio_denoised, audio_spec_denoised - def forward(self, mel_spectrogram): + def forward(self, mel_spectrogram, z): if not self._removed_weight_norm: logging.info("remove WN") - self.waveglow = self.waveglow.remove_weightnorm(self.waveglow) + self.waveglow = remove_weightnorm(self.waveglow) self._removed_weight_norm = True if self.training: raise ValueError("You are using the WaveGlow Infer Neural Module in training mode.") with torch.no_grad(): - audio = self.waveglow.infer(mel_spectrogram, sigma=self._sigma) - return audio + return self.waveglow.forward((mel_spectrogram, z,))[0] class WaveGlowLoss(LossNM): diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d3da8a80fdf5..d78d0dc9923c 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -329,7 +329,6 @@ def rename(self, new_name): """ AppState().tensor_names.rename_NmTensor(self, new_name) - class NeuralTypeError(Exception): """Base class for neural type related exceptions.""" diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 98a9c9ab0358..6183526b87fe 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -78,15 +78,9 @@ def graphs(self): """ return self._neural_graph_manager -<<<<<<< HEAD - def register_module(self, module, name): - """ - Registers a module using the provided name. -======= def register_module(self, module, name: str) -> str: """ Registers a module using the provided name. ->>>>>>> nvidia/fix-sign If name is none - generates a new unique name. 
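A rough sketch of driving the reworked WaveGlow inference entry point above, where the noise tensor `z` is now an explicit input to `WaveGlowInferNM.forward` instead of being sampled inside `infer()`. The checkpoint path and tensor shapes mirror the export test further below; treat them as placeholders rather than prescribed values.

```python
import torch
import nemo.collections.tts as nemo_tts

# Placeholder checkpoint path; shapes follow the WaveGlow export test below.
module = nemo_tts.WaveGlowInferNM(sample_rate=22050)
module.restore_from("./WaveGlowNM.pt")
module.eval()

mel = torch.randn(1, 80, 96).cuda()           # batch x n_mel_channels x frames
stride, n_group = 256, 8                      # values from the WaveGlow upsample layer
z = torch.randn(1, (mel.size(2) * stride) // n_group).cuda()

with torch.no_grad():
    audio = module.forward(mel, z)            # the noise tensor is now an explicit input
```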
Args: diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 439f30fa6af1..3097c09498fb 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -8,4 +8,5 @@ ruamel.yaml soundfile sox torch-stft -unidecode \ No newline at end of file +unidecode +packaging \ No newline at end of file diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index dbe779179061..32a857a35ac2 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -9,3 +9,4 @@ youtokentome numpy tqdm sklearn +gdown diff --git a/scripts/export_jasper_to_onnx.py b/scripts/export_jasper_to_onnx.py index dbb24023fa2f..daa9459394a0 100644 --- a/scripts/export_jasper_to_onnx.py +++ b/scripts/export_jasper_to_onnx.py @@ -1,5 +1,6 @@ # Copyright (c) 2019 NVIDIA Corporation import argparse +import os import torch from ruamel.yaml import YAML @@ -30,6 +31,13 @@ def get_parser(): parser.add_argument( "--pre-v09-model", action="store_true", help="Use if checkpoints were generated from NeMo < v0.9", ) + parser.add_argument( + "--decoder_type", + default='ctc', + type=str, + choices=['ctc', 'classification'], + help="Type of decoder used by the model.", + ) return parser @@ -42,6 +50,7 @@ def main( pre_v09_model=False, batch_size=1, time_steps=256, + decoder_type='ctc', ): yaml = YAML(typ="safe") @@ -54,6 +63,8 @@ def main( num_encoder_input_features = jasper_model_definition['AudioPreprocessing']['features'] elif 'AudioToMelSpectrogramPreprocessor' in jasper_model_definition: num_encoder_input_features = jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'] + elif 'AudioToMFCCPreprocessor' in jasper_model_definition: + num_encoder_input_features = jasper_model_definition['AudioToMFCCPreprocessor']['n_mfcc'] else: num_encoder_input_features = 64 num_decoder_input_features = jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'] @@ -62,14 +73,30 @@ def main( nf = nemo.core.NeuralModuleFactory(create_tb_writer=False) + # Compatibility for `feat_in` defined in config file + if 'feat_in' in jasper_model_definition['JasperEncoder']: + jasper_model_definition['JasperEncoder'].pop('feat_in') + logging.info("Initializing models...") jasper_encoder = nemo_asr.JasperEncoder( feat_in=num_encoder_input_features, **jasper_model_definition['JasperEncoder'] ) - jasper_decoder = nemo_asr.JasperDecoderForCTC( - feat_in=num_decoder_input_features, num_classes=len(jasper_model_definition['labels']), - ) + if decoder_type == 'ctc': + jasper_decoder = nemo_asr.JasperDecoderForCTC( + feat_in=num_decoder_input_features, num_classes=len(jasper_model_definition['labels']), + ) + elif decoder_type == 'classification': + if 'labels' in jasper_model_definition: + num_classes = len(jasper_model_definition['labels']) + else: + raise ValueError("List of class labels must be defined in model config file with key 'labels'") + + jasper_decoder = nemo_asr.JasperDecoderForClassification( + feat_in=num_decoder_input_features, num_classes=num_classes + ) + else: + raise ValueError("`decoder_type` must be one of ['ctc', 'classification']") # This is necessary if you are using checkpoints trained with NeMo # version before 0.9 @@ -88,6 +115,15 @@ def main( jasper_encoder.restore_from(nn_encoder) jasper_decoder.restore_from(nn_decoder) + # Create export directories if they don't already exist + base_export_dir, export_fn = os.path.split(nn_onnx_encoder) + if not os.path.exists(base_export_dir): + os.makedirs(base_export_dir) + 
+ base_export_dir, export_fn = os.path.split(nn_onnx_decoder) + if not os.path.exists(base_export_dir): + os.makedirs(base_export_dir) + logging.info("Exporting encoder...") nf.deployment_export( jasper_encoder, @@ -114,4 +150,5 @@ def main( args.onnx_encoder, args.onnx_decoder, pre_v09_model=args.pre_v09_model, + decoder_type=args.decoder_type, ) diff --git a/tests/configs/test_deploy_export.yaml b/tests/configs/test_deploy_export.yaml index 91fd940c603b..2867660ffe31 100644 --- a/tests/configs/test_deploy_export.yaml +++ b/tests/configs/test_deploy_export.yaml @@ -1,6 +1,6 @@ TaylorNet: header: - full_spec: nemo.backends.pytorch.tutorials.TaylorNet + full_spec: nemo.backends.pytorch.tutorials.toys.TaylorNet init_params: dim: 4 diff --git a/tests/unit/core/test_deploy_export.py b/tests/unit/core/test_deploy_export.py index 55d38dec82dc..f44006d13095 100644 --- a/tests/unit/core/test_deploy_export.py +++ b/tests/unit/core/test_deploy_export.py @@ -18,12 +18,9 @@ import copy import os -import sys from collections import OrderedDict -from os import path, sys from pathlib import Path -from unittest import TestCase - +import urllib.request import numpy as np # git clone git@github.com:microsoft/onnxruntime.git @@ -37,12 +34,12 @@ import onnxruntime as ort import pytest import torch -from ruamel.yaml import YAML import nemo -import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp import nemo.collections.nlp.nm.trainables.common.token_classification_nm +import nemo.collections.tts as nemo_tts + from nemo import logging from nemo.core import DeploymentFormat as DF from nemo.core import NeuralModule @@ -87,17 +84,15 @@ def __test_export_route(self, module, out_name, mode, input_example=None): os.remove(out) module.eval() - outputs_fwd = ( - module.forward(*tuple(input_example.values())) - if isinstance(input_example, OrderedDict) - else ( - module.forward(*input_example) - if isinstance(input_example, tuple) - else module.forward(input_example) - if input_example is not None - else None - ) - ) + torch.manual_seed(1) + if isinstance(input_example, OrderedDict): + outputs_fwd = module.forward(*tuple(input_example.values())) + elif isinstance(input_example, tuple): + outputs_fwd = module.forward(*input_example) + elif input_example is not None: + outputs_fwd = module.forward(input_example) + else: + outputs_fwd = None deploy_input_example = ( tuple(input_example.values()) if isinstance(input_example, OrderedDict) else input_example @@ -110,7 +105,6 @@ def __test_export_route(self, module, out_name, mode, input_example=None): output_example=outputs_fwd, ) - tol = 5.0e-3 assert out.exists() == True if mode == DF.TRTONNX: @@ -167,15 +161,15 @@ def __test_export_route(self, module, out_name, mode, input_example=None): input_name = input_names[i] if input_name in module._disabled_deployment_input_ports: continue - inputs[input_name] = ( - input_example[input_name].cpu().numpy() - if isinstance(input_example, OrderedDict) - else ( - input_example[i].cpu().numpy() - if isinstance(input_example, tuple) - else input_example.cpu().numpy() - ) - ) + + if isinstance(input_example, OrderedDict): + for key in input_example.keys(): + if key in input_name: + inputs[input_name] = input_example[key].cpu().numpy() + elif isinstance(input_example, tuple): + inputs[input_name] = input_example[i].cpu().numpy() + else: + inputs[input_name] = input_example.cpu().numpy() out_dict = active_runner.infer(feed_dict=feed_dict, output=outputs_fwd) for ov in out_dict.values(): @@ -204,63 +198,59 @@ 
def __test_export_route(self, module, out_name, mode, input_example=None): elif mode == DF.ONNX: # Must recompute because *module* might be different now - outputs_fwd = ( - module.forward(*tuple(input_example.values())) - if isinstance(input_example, OrderedDict) - else ( - module.forward(*input_example) - if isinstance(input_example, tuple) - else module.forward(input_example) - ) - ) + if isinstance(input_example, OrderedDict): + outputs_fwd = module.forward(*tuple(input_example.values())) + elif isinstance(input_example, tuple): # or isinstance(input_example, list) + outputs_fwd = module.forward(*input_example) + elif input_example is not None: + outputs_fwd = module.forward(input_example) + else: + outputs_fwd = None + sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC ort_session = ort.InferenceSession(out_name, sess_options, ['CUDAExecutionProvider']) print('Execution Providers: ', ort_session.get_providers()) inputs = dict() - input_names = list(module.input_ports) + input_names = ( + list(input_example.keys()) + if isinstance(input_example, OrderedDict) + else list(module.input_ports.keys()) + ) ort_inputs = ort_session.get_inputs() - for i in range(len(input_names)): - input_name = input_names[i] - if input_name in module._disabled_deployment_input_ports: - continue - inputs[input_name] = ( - input_example[input_name].cpu().numpy() - if isinstance(input_example, OrderedDict) - else ( - input_example[i].cpu().numpy() - if isinstance(input_example, tuple) + + for node_arg in ort_inputs: + ort_name = node_arg.name + for input_name in input_names: + if input_name in ort_name or ort_name in input_name: + break + if ort_name not in inputs: + inputs[ort_name] = ( + input_example[input_name].cpu().numpy() + if isinstance(input_example, OrderedDict) else input_example.cpu().numpy() ) - ) - outputs_scr = ort_session.run(None, inputs) + + output_names = None + outputs_scr = ort_session.run(output_names, inputs) outputs_scr = torch.from_numpy(outputs_scr[0]).cuda() elif mode == DF.TORCHSCRIPT: - scr = torch.jit.load(out_name) - if isinstance(module, nemo.backends.pytorch.tutorials.TaylorNet): - input_example = torch.randn(4, 1).cuda() - outputs_fwd = module.forward(input_example) + tscr = torch.jit.load(out_name) outputs_scr = ( - module.forward(*tuple(input_example.values())) + tscr.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict) else ( - module.forward(*input_example) - if isinstance(input_example, tuple) - else module.forward(input_example) + tscr.forward(*input_example) if isinstance(input_example, tuple) else tscr.forward(input_example) ) ) elif mode == DF.PYTORCH: module.load_state_dict(torch.load(out_name)) - module.eval() - outputs_scr = ( - module.forward(*tuple(input_example.values())) - if isinstance(input_example, OrderedDict) - else ( - module.forward(*input_example) - if isinstance(input_example, tuple) - else module.forward(input_example) - ) - ) + if isinstance(input_example, OrderedDict): + outputs_scr = module.forward(*tuple(input_example.values())) + elif isinstance(input_example, tuple) or isinstance(input_example, list): + outputs_scr = module.forward(*input_example) + else: + outputs_scr = module.forward(input_example) outputs_scr = ( outputs_scr[0] if isinstance(outputs_scr, tuple) or isinstance(outputs_scr, list) else outputs_scr @@ -269,6 +259,9 @@ def __test_export_route(self, module, out_name, mode, input_example=None): outputs_fwd[0] if isinstance(outputs_fwd, 
tuple) or isinstance(outputs_fwd, list) else outputs_fwd ) + n = outputs_fwd.numel() + tol = 5.0e-3 if n < 10000 else (5.0e-2 if n < 100000 else (5.0e-1)) + assert (outputs_scr - outputs_fwd).norm(p=2) < tol if out.exists(): @@ -280,7 +273,7 @@ def __test_export_route(self, module, out_name, mode, input_example=None): "input_example, module_name, df_type", [ # TaylorNet export tests. - (None, "TaylorNet", DF.TORCHSCRIPT), + (torch.randn(4, 1), "TaylorNet", DF.PYTORCH), # TokenClassifier export tests. (torch.randn(16, 16, 512), "TokenClassifier", DF.ONNX), (torch.randn(16, 16, 512), "TokenClassifier", DF.TORCHSCRIPT), @@ -327,9 +320,7 @@ def test_module_export(self, tmpdir, input_example, module_name, df_type): @pytest.mark.unit @pytest.mark.run_only_on('GPU') - @pytest.mark.parametrize( - "df_type", [DF.ONNX, DF.TORCHSCRIPT, DF.PYTORCH, pytest.param(DF.TRTONNX, marks=requires_trt)] - ) + @pytest.mark.parametrize("df_type", [DF.ONNX, DF.TORCHSCRIPT, DF.PYTORCH]) def test_hf_bert(self, tmpdir, df_type): """ Tests BERT export. @@ -350,3 +341,28 @@ def test_hf_bert(self, tmpdir, df_type): tmp_file_name = str(tmpdir.mkdir("export").join("bert")) # Test export. self.__test_export_route(module=bert, out_name=tmp_file_name, mode=df_type, input_example=input_example) + + @pytest.mark.unit + @pytest.mark.run_only_on('GPU') + @pytest.mark.parametrize("df_type", [DF.ONNX, DF.TORCHSCRIPT, DF.PYTORCH]) + def test_waveglow(self, tmpdir, df_type): + url = "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ljspeech/versions/2/files/WaveGlowNM.pt" + ptfile = "./WaveGlowNM.pt" + if not Path(ptfile).is_file(): + urllib.request.urlretrieve(url, ptfile) + + module = nemo_tts.WaveGlowInferNM(sample_rate=22050) + module.restore_from(ptfile) + module.eval() + + torch.manual_seed(1) + mel = torch.randn(1, 80, 96).cuda() + stride = 256 # value from waveglow upsample + n_group = 8 + z_size2 = (mel.size(2) * stride) // n_group + z = torch.randn(1, z_size2).cuda() + + input_example = OrderedDict([("mel_spectrogram", mel), ("z", z)]) + tmp_file_name = str(tmpdir.mkdir("export").join("waveglow")) + + self.__test_export_route(module=module, out_name=tmp_file_name, mode=df_type, input_example=input_example) diff --git a/tests/unit/test_unit_speech_commands.py b/tests/unit/test_unit_speech_commands.py index 7e92b8762858..d8563ceafd3a 100644 --- a/tests/unit/test_unit_speech_commands.py +++ b/tests/unit/test_unit_speech_commands.py @@ -21,6 +21,7 @@ import unittest from unittest import TestCase +import numpy as np import pytest from ruamel.yaml import YAML @@ -83,27 +84,78 @@ def tearDownClass(cls) -> None: @pytest.mark.unit def test_pytorch_audio_dataset_with_perturbation(self): - perturbations = [ - perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46), - perturb.ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0), - ] + def construct_perturbed_dataset(perturbation): + if perturbation is not None: + # Execute perturbations with 100% probability + prob_perturb = [(1.0, perturbation)] + audio_augmentor = perturb.AudioAugmentor(prob_perturb) + else: + audio_augmentor = None - # Execute perturbations with 100% probability - prob_perturb = [(1.0, p) for p in perturbations] + featurizer = WaveformFeaturizer( + sample_rate=self.featurizer_config['sample_rate'], + int_values=self.featurizer_config['int_values'], + augmentor=audio_augmentor, + ) + + ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer) + return ds + + baseline_ds = 
construct_perturbed_dataset(perturbation=None) + num_samples = len(baseline_ds) + + # test white noise perturbation + white_noise_perturbation = perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46) + white_noise_ds = construct_perturbed_dataset(white_noise_perturbation) + max_range = 10.0 ** (-46 / 20.0) + min_range = 10.0 ** (-90 / 20.0) + rng = np.random.RandomState(0) + + for i in range(num_samples): + xp = white_noise_ds[i][0] + xp_max = rng.randn(xp.shape[0]) * max_range + xp_min = rng.randn(xp.shape[0]) * min_range + + # Compute z statistic + z_max = (xp.mean() - xp_max.mean()) / np.sqrt(np.square(xp.std()) + np.square(xp_max.std())) + z_min = (xp.mean() - xp_min.mean()) / np.sqrt(np.square(xp.std()) + np.square(xp_min.std())) + self.assertTrue(z_max < 0.01) + self.assertTrue(z_min < 0.01) - audio_augmentor = perturb.AudioAugmentor(prob_perturb) + # test shift perturbation + shift_perturbation = perturb.ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0) + shift_ds = construct_perturbed_dataset(shift_perturbation) - featurizer = WaveformFeaturizer( - sample_rate=self.featurizer_config['sample_rate'], - int_values=self.featurizer_config['int_values'], - augmentor=audio_augmentor, + for i in range(num_samples): + x = baseline_ds[i][0] + xp = shift_ds[i][0] + delta = np.abs(x - xp) + count_zeros = np.count_nonzero(delta == 0.0) + self.assertTrue(count_zeros >= 0) + + # test time stretch perturbation + ts_perturbation = perturb.TimeStretchPerturbation(min_speed_rate=0.9, max_speed_rate=1.1, num_rates=4) + timestretch_ds = construct_perturbed_dataset(ts_perturbation) + + for i in range(num_samples): + x = baseline_ds[i][0] + xp = timestretch_ds[i][0] + self.assertTrue((x.shape[0] > xp.shape[0]) or (x.shape[0] < xp.shape[0])) + + # test speed perturbation + speed_perturbation = perturb.SpeedPerturbation( + sr=self.featurizer_config['sample_rate'], + resample_type='kaiser_fast', + min_speed_rate=0.9, + max_speed_rate=1.1, + num_rates=4, ) - ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer,) + speed_ds = construct_perturbed_dataset(speed_perturbation) - for i in range(len(ds)): - logging.info(ds[i]) - # logging.info(ds[i][0].shape) - # self.assertEqual(freq, ds[i][0].shape[0]) + for i in range(num_samples): + x = baseline_ds[i][0] + xp = speed_ds[i][0] + self.assertTrue((x.shape[0] > xp.shape[0]) or (x.shape[0] < xp.shape[0])) @pytest.mark.unit def test_dataloader(self): @@ -182,6 +234,9 @@ def test_audio_preprocessors(self): if installed_torchaudio: to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None) to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15) + time_stretch_augment = nemo_asr.TimeStretchAugmentation( + self.featurizer_config['sample_rate'], probability=1.0, min_speed_rate=0.9, max_speed_rate=1.1 + ) to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50) @@ -195,6 +250,7 @@ def test_audio_preprocessors(self): if installed_torchaudio: spec = to_spectrogram.forward(input_signals, seq_lengths) mfcc = to_mfcc.forward(input_signals, seq_lengths) + ts_input_signals = time_stretch_augment.forward(input_signals, seq_lengths) # Check that number of features is what we expect self.assertTrue(melspec[0].shape[1] == 50) @@ -202,3 +258,7 @@ def test_audio_preprocessors(self): if installed_torchaudio: self.assertTrue(spec[0].shape[1] == 201) # n_fft // 2 + 1 bins self.assertTrue(mfcc[0].shape[1] == 15) + + timesteps = ts_input_signals[0].shape[1] + self.assertTrue(timesteps <= 
int(1.15 * self.featurizer_config['sample_rate'])) + self.assertTrue(timesteps >= int(0.85 * self.featurizer_config['sample_rate']))
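For context on the bounds asserted above: time stretching scales the signal length by roughly `1 / rate`, so a 0.9–1.1 speed-rate range keeps a one-second input within 0.85x–1.15x of its original length. A small sanity sketch, assuming a 16 kHz value for `featurizer_config['sample_rate']` (the actual test config may differ):

```python
# Assumed sample rate; the test reads it from featurizer_config['sample_rate'].
sr = 16000
min_rate, max_rate = 0.9, 1.1

longest = int(sr / min_rate)   # slowing down lengthens the signal (~1.11 * sr)
shortest = int(sr / max_rate)  # speeding up shortens it (~0.91 * sr)

# Both extremes fall inside the 0.85x / 1.15x bounds checked by the unit test.
assert int(0.85 * sr) <= shortest and longest <= int(1.15 * sr)
```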