From 4827060843f29df2693fa6a27626d5bdaf55c41f Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 10 Feb 2023 08:21:59 -0800 Subject: [PATCH] ONNX export for RadTTS (#5880) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Megatron positional encoding alibi fix (#5808) (#5863) * 1. Debugging. * 1. Debugging. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 1. Debugging. * 1. Debugging. * 1. Fixed initialization. Signed-off-by: Micha Livne * 1. Debugging. * 1. Debugging. * 1. Debugging. * 1. Debugging. * 1. Debugging. * 1. Debugging. * 1. Debugging. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 1. Debugging. * 1. Removed scale from ALiBi. Signed-off-by: Micha Livne * 1. Updated yaml and added support to control number of alibi heads. Signed-off-by: Micha Livne * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 1. Removed num_attention_heads_alibi from configs. Signed-off-by: Micha Livne Signed-off-by: Micha Livne Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Micha Livne Signed-off-by: Micha Livne Co-authored-by: Micha Livne Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Micha Livne Signed-off-by: Jason * Fix segmenting for pcla inference (#5849) * Fix segmenting for pcla inference Signed-off-by: Matvei Novikov * Fix segmenting for pcla inference Signed-off-by: Matvei Novikov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Matvei Novikov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * indentation fix (#5861) (#5862) Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: nithinraok Co-authored-by: Nithin Rao Signed-off-by: Jason * add ambernet to readme (#5872) (#5873) Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: Jason * Fix wrong label mapping in batch_inference for label_model (#5767) (#5870) * fix batch inference * add test for batch * fix device Signed-off-by: fayejf Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: Jason * WAR for https://github.com/pytorch/pytorch/pull/91526 Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Fix memory allocation of NeMo Multi-speaker Data Simulator (#5864) * fix data simulator Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * Adding noise_manifest handling for faster speed Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added multi-gpu feature Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added a parameter for noise source file number Signed-off-by: Taejin Park * Fixed noise_manifest error bug Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: stevehuang52 Signed-off-by: Taejin Park 
Co-authored-by: Taejin Park Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * RETRO model finetuning (#5800) * add save and load dynmaic index Signed-off-by: Yi Dong * add chunk stride feature Signed-off-by: Yi Dong * add chunk stride feature Signed-off-by: Yi Dong * add no pq index Signed-off-by: Yi Dong * added megatron lm compatible mode Signed-off-by: Yi Dong * addd config Signed-off-by: Yi Dong * fix position embedding Signed-off-by: Yi Dong * added index factory Signed-off-by: Yi Dong * share neighbors and weights amoung strategies Signed-off-by: Yi Dong * fix bug Signed-off-by: Yi Dong * added metric tto faiss index Signed-off-by: Yi Dong * set default to inner product Signed-off-by: Yi Dong * added qa fine tuen dataset Signed-off-by: Yi Dong * added fine tuning code Signed-off-by: Yi Dong * trim it Signed-off-by: Yi Dong * fix data issue Signed-off-by: Yi Dong * fix style Signed-off-by: Yi Dong * added version Signed-off-by: Yi Dong * fix key error Signed-off-by: Yi Dong * make sure to overwrite the cfg Signed-off-by: Yi Dong * make multiple sentence bert available Signed-off-by: Yi Dong * fix the document Signed-off-by: Yi Dong * fix the table Signed-off-by: Yi Dong * fix transformer Signed-off-by: Yi Dong * make sure to turn off the rope in chunked cross attention layer Signed-off-by: Yi Dong * fix the security issue Signed-off-by: Yi Dong * style fix Signed-off-by: Yi Dong * fix codeql issues Signed-off-by: Yi Dong * fix Signed-off-by: Yi Dong * use -1 Signed-off-by: Yi Dong * fix empty index Signed-off-by: Yi Dong * clean up Signed-off-by: Yi Dong * fix the lower bound for repetition penalty Signed-off-by: Yi Dong * add retro qa inference strategy Signed-off-by: Yi Dong * added new inference logic Signed-off-by: Yi Dong * working inference Signed-off-by: Yi Dong * fix TP inference Signed-off-by: Yi Dong * revert requirement Signed-off-by: Yi Dong * added file inference Signed-off-by: Yi Dong * use string to prevent collison Signed-off-by: Yi Dong * use NQ test Signed-off-by: Yi Dong * fix prompt Signed-off-by: Yi Dong * fix inference Signed-off-by: Yi Dong * set good defaults for demo Signed-off-by: Yi Dong * replicate adlr Signed-off-by: Yi Dong * make sure to turn off attention reset for megatron lm compatible model Signed-off-by: Yi Dong * style fix Signed-off-by: Yi Dong * fix typo Signed-off-by: Yi Dong * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix inference error Signed-off-by: Yi Dong * fix logging Signed-off-by: Yi Dong * address comments Signed-off-by: Yi Dong --------- Signed-off-by: Yi Dong Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * [TTS] GAN-based spectrogram enhancer (#5565) * [TTS] add SpectrogramEnhancer based on StyleGAN 2 Signed-off-by: Roman Korostik * [TTS] some tests for spectrogram enhancer Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: a tiny clean up Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: log images during training Signed-off-by: Roman Korostik * exp_manager: pass save_on_train_epoch_end to checkpointing callback Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: add training script and config examples Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix comments Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: don't assume FastPitch Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: better input 
shapes handling Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix porting error Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix logging and .nemo saving Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: clean up scaling Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: formatting Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: update examples Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: shape handling Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove LoggerCollection handling Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: copyright notice for tests Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: use process_batch helper Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: return empty list of available models Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: some docs Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: style --fix Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: chan_last -> channel_last Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove unused imports Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove unused return value Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: losses are nn.Modules now Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: init optimizers from config Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: formatting Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: unused imports Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: typechecking Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: more tests Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix logging images Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: unclutter prepare_batch Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: init generator and discriminator from the config for consistency with other NeMo models Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: update spectrogram range in the example config Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: comment on loss weights in the example config Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: rename Conv2DMod to Conv2DModulated Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove unused imports Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix CodeQL import warnings Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: type_as_recursive -> to_device_recursive Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: move to_device_recursive to helpers Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: move losses to a separate module, add comments Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: add optimizers' entries to config Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix test configs Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: support length masking for 3-dim tensors Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: add masking to spectrogram normalization Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix tests Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: add spectrogram normalization tests Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix imports and formatting in tests Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix docstring typo Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: rename G and D to 
generator and discriminator Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: better argument naming in interfaces (condition -> input_spectograms, target -> target_spectrograms) Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: formatting Signed-off-by: Roman Korostik * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [TTS] SpectrogramEnhancer: fix import warnings in modules Signed-off-by: Roman Korostik * [TTS] add resynthesize_dataset.py script Signed-off-by: Roman Korostik * [TTS] add PairedRealFakeSpectrogramsDataset Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: update example config to reflect new data setup Signed-off-by: Roman Korostik * [TTS] resynthesize_dataset.py: remove unused imports Signed-off-by: Roman Korostik * [TTS] resynthesize_dataset.py: use nemo manifest handling Signed-off-by: Roman Korostik * [TTS] resynthesize_dataset.py: remove unused import Signed-off-by: Roman Korostik * [TTS] resynthesize_dataset.py: underscores for .npy names Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove return value from a test Signed-off-by: Roman Korostik * [TTS] add length masking helper Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: use common tts length mask function Signed-off-by: Roman Korostik * [TTS] unused imports in tts helpers Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: fix an import Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: introduce computed upsample_factor to generator Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: clean up and clarify validation data setup Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: remove a hardcoded path in the example config Signed-off-by: Roman Korostik * [TTS] SpectrogramEnhancer: configurize max_spectrogram_length in generator Signed-off-by: Roman Korostik * [TTS] resynthesize_dataset.py: consistent dashes and underscores in CLI args Signed-off-by: Roman Korostik * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Roman Korostik Signed-off-by: Roman Korostik Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * Optimizing distributed Adam when running with one work queue (#5560) * Dist Adam constructs a single param bucket for each GPT layer Signed-off-by: Tim Moon * Synchronize dist Adam reduce-scatters before launching model-parallel all-reduces Signed-off-by: Tim Moon * Configure per-layer dist Adam buckets for BERT and T5 Signed-off-by: Tim Moon * Remove unused variables Signed-off-by: Tim Moon * Configure GPT with one dist Adam bucket per virtual pipeline stage Signed-off-by: Tim Moon * Configure BERT with one dist Adam bucket per virtual pipeline stage Signed-off-by: Tim Moon * Update Apex commit in Dockerfile Need recent updates to Apex distributed Adam optimizer. 
Signed-off-by: Tim Moon * Remove logic for per-virtual-pipeline distopt buckets from T5 Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Signed-off-by: Jason * fix(readme): fix typo (#5883) Signed-off-by: Jean-Louis Queguiner Signed-off-by: Jason * TTS inference with Heteronym classification model, hc model inference refactoring (#5768) * refactor inference, fix span detection Signed-off-by: ekmb * fix merge conflicts Signed-off-by: ekmb * fix merge conflicts Signed-off-by: ekmb * remove unused var Signed-off-by: ekmb * clean up, test update Signed-off-by: ekmb * arg name update Signed-off-by: ekmb * merge wip Signed-off-by: ekmb * revert changes Signed-off-by: ekmb * update docs, move heteronym to baseg2p Signed-off-by: ekmb * change wordid file defaults to none Signed-off-by: ekmb * add manifest check Signed-off-by: ekmb * replace homograph with heteronym, upper case wordid for riva, review feedback Signed-off-by: ekmb * add log message, update comment Signed-off-by: ekmb * rename test manifest field Signed-off-by: ekmb --------- Signed-off-by: ekmb Signed-off-by: Jason * take out retro doc (#5885) (#5886) Signed-off-by: Yi Dong Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Signed-off-by: Jason * Add option to disable distributed parameters in distributed Adam optimizer (#5685) * Add option to run dist Adam without distributed params Similar to DDP, but leverages dist Adam's support for overlapping communication with backward compute Signed-off-by: Tim Moon * Fix bug in grad clipping when dist Adam has redundant params Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: Oleksii Kuchaiev Signed-off-by: Jason * [ASR] Separate Audio-to-Text (BPE, Char) dataset construction (#5774) * Separate full BPE dataset construction Signed-off-by: Vladimir Bataev * Fix the case when the dataset is None Signed-off-by: Vladimir Bataev * Fix comment Signed-off-by: Vladimir Bataev * Fix typos Signed-off-by: Vladimir Bataev * Separate char dataset construction. Fix DALI dataset usage. 
Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Jason * transformer duration added and IPA config files added Signed-off-by: Jason * inference issue for pace resolved Signed-off-by: Jason * Latest ONNX developments Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Remove MCD_DTW tarball (#5889) Signed-off-by: Jocelyn Huang Signed-off-by: Jason * Block large files from being merged into NeMo main (#5898) * Attempt to use large-file pre-commit ci hook Signed-off-by: SeanNaren * Set defaults and enforce Signed-off-by: SeanNaren * Set to 1000 Signed-off-by: SeanNaren * Remove enforcement Signed-off-by: SeanNaren --------- Signed-off-by: SeanNaren Signed-off-by: Jason * Reduce memory usage in getMultiScaleCosAffinityMatrix function (#5876) * Updated offline_clustering.py, the getMultiScaleCosAffinityMatrix function, reduced memory usage Signed-off-by: gabitza-tech * torch.empty.cache() outside forward_infer() Signed-off-by: Taejin Park * Removed unnecessary lines Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Speed up for non torch.jit.script Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * parallelism is default off Signed-off-by: Taejin Park * nme_mat_size is unified as 512, removing redundant docstring Signed-off-by: Taejin Park --------- Signed-off-by: gabitza-tech Signed-off-by: Taejin Park Co-authored-by: Taejin Park Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * set max_steps for lr decay through config (#5780) * set max_steps for lr decay through config * added warning for optim sched max_steps config option * reverted changes to modelPT and updated megatron_base_model * added the experimental cosine annealing scheduler class * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update decay_steps for cosine annealing exp class * added copyright --------- Co-authored-by: ANMOL GUPTA Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: Jason * Fix transducer and question answering tutorial bugs (#5809) (#5810) Co-authored-by: Zhilin Wang Co-authored-by: Eric Harper Signed-off-by: Jason * update apex install instructions (#5901) (#5902) Signed-off-by: ericharper Co-authored-by: Eric Harper Signed-off-by: Jason * Hybrid ASR-TTS models (#5659) Add hybrid ASR-TTS models and text-to-text dataset Signed-off-by: Vladimir Bataev Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * Set providers for ORT inference session (#5903) Signed-off-by: athitten Signed-off-by: Jason * [ASR] Configurable metrics for audio-to-audio + removed experimental decorators (#5827) * Added an option to configure metrics for audio-to-audio models Removed experimental decorators Signed-off-by: Ante Jukić * Addressed review comments Signed-off-by: Ante Jukić --------- Signed-off-by: Ante Jukić Signed-off-by: Jason * Correct doc for RNNT transcribe() function (#5904) Signed-off-by: smajumdar Signed-off-by: Jason * Add segmentation export to Audacity label file (#5857) * Save the segmentation as label file for Audacity Audacity is a free open source audio editor that can import label files to quickly assess the segmentation quality.
This commit add the export to [Audacity label format](https://manual.audacityteam.org/man/importing_and_exporting_labels.html) so that directly after running the segmentation tool the segmentation quality can be assessed or the segmentation can be shared easily. Signed-off-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> * Fix styling Signed-off-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused score in audacity export score is not written in audacity label file so we can safely not load it from segment. Signed-off-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> --------- Signed-off-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * Cross-Lingual objectives (XLM) and multilingual (many-many) support for Megatron-NMT (#5026) * Update blendable dataset, and refactor seq2seq data Signed-off-by: MaximumEntropy * Blendable dataset with binarized mmap working Signed-off-by: MaximumEntropy * Pass seed from cfg to dataset Signed-off-by: MaximumEntropy * Fix multilingual setup Signed-off-by: MaximumEntropy * Add on epoch start reconfiguration Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Update tokenizer creation for multilingual Signed-off-by: MaximumEntropy * Tmp Signed-off-by: MaximumEntropy * Update NMT script Signed-off-by: MaximumEntropy * Remove unused import Signed-off-by: MaximumEntropy * Update training script Signed-off-by: MaximumEntropy * Log consumed samples Signed-off-by: MaximumEntropy * Logging on val epoch end Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Remove redundant print Signed-off-by: MaximumEntropy * Ckpt averaging for non model parallel megatron models Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Empty Signed-off-by: MaximumEntropy * Update error message Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Remove check Signed-off-by: MaximumEntropy * Restore fixes Signed-off-by: MaximumEntropy * Remove ipdb Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * Move to classmethods Signed-off-by: MaximumEntropy * Initial Signed-off-by: MaximumEntropy * 1. Debugging. Signed-off-by: Micha Livne * Refactor masking to add skip_masking_id and working xlm bert and t5 datasets Signed-off-by: MaximumEntropy * 1. Debugging. Signed-off-by: Micha Livne * 1. Testing a simple solution Signed-off-by: Micha Livne * 1. Fixed. Seems to work. Need to validate. Signed-off-by: Micha Livne * 1. Added support in CSV and text memmap toMEgatron encoder-decoder Signed-off-by: Micha Livne * 1. Added support in CSV. Signed-off-by: Micha Livne * 1. Fixed style. Signed-off-by: Micha Livne * 1. Fixed style. 2. Fixed bugs. Signed-off-by: Micha Livne * 1. Debugging. Signed-off-by: Micha Livne * 1. Fixed bugs. Signed-off-by: Micha Livne * 1. Fixed style. Signed-off-by: Micha Livne * 1. Updated yaml. Signed-off-by: Micha Livne * Minor Signed-off-by: MaximumEntropy * 1. Fixed warnings. Signed-off-by: Micha Livne * 1. Fixed style. Signed-off-by: Micha Livne * 1. Fixed style. Signed-off-by: Micha Livne * 1. Fixed a bug. 
Signed-off-by: Micha Livne * Tmp Signed-off-by: MaximumEntropy * Updates Signed-off-by: MaximumEntropy * Fix minor data things Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * Lang ids for validation datasets Signed-off-by: MaximumEntropy * More fixes for lang id code at inference Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Remove pdb Signed-off-by: MaximumEntropy * Fix prepend ID and bleu logging Signed-off-by: MaximumEntropy * Refactor Signed-off-by: MaximumEntropy * Fixes for many-many NMT Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Reset o2 default Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restore dataset utils Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Allreduce bleu scores Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * 1. Loading index file into memmap object. Signed-off-by: Micha Livne * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 1. Fixed style. Signed-off-by: Micha Livne * 1. Fixed extentin when loading files. Signed-off-by: Micha Livne * Fix Signed-off-by: MaximumEntropy * Fix redundant building Signed-off-by: MaximumEntropy * PP > 2 for NMT Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Merge and fix Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix Signed-off-by: MaximumEntropy * Refactor multilingual again Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixes Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor and verify data formats Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * cleanup Signed-off-by: MaximumEntropy * more fixes Signed-off-by: MaximumEntropy * Fix passing langs Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixes Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * More fixes Signed-off-by: MaximumEntropy * Fixes for bart Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: MaximumEntropy Signed-off-by: Micha Livne Signed-off-by: Micha Livne Co-authored-by: Micha Livne Co-authored-by: Micha Livne Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Micha Livne Signed-off-by: Jason * ONNX export working Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Fixing unit test Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Update isort to the latest version (#5895) Update isort to the latest version Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Jason * Pin isort version (#5914) Signed-off-by: Vladimir 
Bataev Signed-off-by: Jason * Moved eval notebook data to aws (#5911) Signed-off-by: Jocelyn Huang Signed-off-by: Jason * FilterbankFeaturesTA to match FilterbankFeatures (#5913) Signed-off-by: Mohamed Saad Ibn Seddik Signed-off-by: Jason * fixed missing long_description_content_type (#5909) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * added TPMLP for T5-based models (#5840) (#5841) Signed-off-by: David Mosallanezhad Co-authored-by: David Co-authored-by: David Mosallanezhad Co-authored-by: Eric Harper Signed-off-by: Jason * Fixing 0-size issue and ONNX BS>1 trace Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Fixing code scan alert Signed-off-by: Boris Fomitchev Signed-off-by: Jason * update container (#5917) Signed-off-by: ericharper Signed-off-by: Jason * remove conda pynini install (#5921) Signed-off-by: ekmb Signed-off-by: Jason * Merge release main (#5916) * update branch Signed-off-by: ericharper * added TPMLP for T5-based models (#5840) Signed-off-by: David Mosallanezhad Signed-off-by: David Mosallanezhad Co-authored-by: David Mosallanezhad * remove notebook (#5859) Signed-off-by: ericharper Signed-off-by: ericharper * update branch Signed-off-by: ericharper --------- Signed-off-by: ericharper Signed-off-by: David Mosallanezhad Co-authored-by: David Co-authored-by: David Mosallanezhad Signed-off-by: Jason * Dynamic freezing in Nemo (#5879) * Initial commit for dynamic freezing logic Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated logic to handle lists and updated docs Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Transferred dynamic freezing logic to core from asr Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert asr config to original Signed-off-by: Daniel Egert * Fixed tab indent in core.rst Signed-off-by: Daniel Egert * Updated modelPT for latest from master Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed indents in docs Signed-off-by: Daniel Egert --------- Signed-off-by: Daniel Egert Co-authored-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * Fix Windows bug with save_restore_connector (#5919) * Initial commit for Windows bug with save_to Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Egert Co-authored-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jason * add new lannguages to doc (#5939) Signed-off-by: Yang Zhang Signed-off-by: Jason * Workarounds for ONNX export with autocast Signed-off-by: Boris Fomitchev Signed-off-by: Jason * fix val loss computation in megatron (#5871) * fix val loss computation in megatron * Fix NaN handling during validation --------- Co-authored-by: ANMOL GUPTA Co-authored-by: Mikołaj Błaż Co-authored-by: Eric Harper Signed-off-by: Jason * Restoring sigmas Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Add core classes and functions for online clustering diarizer part 2 (#5609) * Add core classes and functions for online clustering diarizer Signed-off-by: Taejin 
Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add audio to labels code Signed-off-by: Taejin Park * resolve type errors Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added unit=tests for very short audio Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Filled all missing docstrings Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * resolved conflict and added missing docstrings Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed unit-test errors Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the wrongly added file - megatron_gpt_model.py Signed-off-by: Taejin Park * Fix wrongly included file - megatron_gpt_model.py Signed-off-by: Taejin Park * resolve code quality issue Signed-off-by: Taejin Park * Fixed unit-test errors and bugs Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * changed total_sec for offline_clustering toy_data in unit-tests Signed-off-by: Taejin Park * fixed merging index offset bug Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * only including part 1 files Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed unused function Signed-off-by: Taejin Park * fixed unused imports Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * divided nmesc_clustering.py into two and reflected first-pass comments Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adding offline/online_clustering.py Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix code QL autocomment Signed-off-by: Taejin Park * Removed unused imports Signed-off-by: Taejin Park * Update nemo/collections/asr/parts/utils/online_clustering.py Co-authored-by: Sean Naren Signed-off-by: Taejin Park * Reflected comments Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * resolved code scanning issue Signed-off-by: Taejin Park * Adding online_diarizer.py Signed-off-by: Taejin Park * updated tests and speaker_utils Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed the wrong test eval Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updating online diarizer for varialbe name change Signed-off-by: Taejin Park * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reflected comments and some typo fixes in speaker_utils Signed-off-by: Taejin Park * [pre-commit.ci] auto 
fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Taejin Park Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Sean Naren Signed-off-by: Jason * Distributed Adam optimizer overlaps param all-gather with forward compute (#5684) * Add distopt support for overlapping param all-gather with forward compute Signed-off-by: Tim Moon * Update Apex commit Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: Eric Harper Signed-off-by: Jason * [TTS][ZH] added new NGC model cards with polyphone disambiguation. (#5940) * [TTS][ZH] added new NGC model cards with polyphone disambiguation. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * Moved truncation of context higher up Signed-off-by: Boris Fomitchev Signed-off-by: Jason * [TN] bugfix file handler is not closed. (#5955) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * Added unit test for regulate_len. Unscripted sort_tensor for TRT Signed-off-by: Boris Fomitchev Signed-off-by: Jason * Fixed slice Signed-off-by: Boris Fomitchev Signed-off-by: Jason * [TTS] deprecate AudioToCharWithPriorAndPitchDataset. (#5959) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * bugfix: file handlers are not closed. (#5956) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * [TTS][G2P] deprecate add_symbols (#5961) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * fix broken link (#5968) Signed-off-by: ericharper Signed-off-by: Jason * Fix hybridasr bug (#5950) (#5957) Signed-off-by: Jason * Added list_available_models (#5967) * Added list_available_models Signed-off-by: Evgeniy Shabalin <36159472+treacker@users.noreply.github.com> * Added to readme Signed-off-by: Evgeniy Shabalin * added vits to docs Signed-off-by: Evgeniy Shabalin * added vits to docs Signed-off-by: Evgeniy Shabalin --------- Signed-off-by: Evgeniy Shabalin <36159472+treacker@users.noreply.github.com> Signed-off-by: Evgeniy Shabalin Signed-off-by: Jason * Move settings to `pyproject.toml`. 
Remove deprecated `pytest-runner` (#5947) * Move project settings to pyproject.toml Signed-off-by: Vladimir Bataev * Remove setup.cfg Signed-off-by: Vladimir Bataev * Remove deprecated pytest-runner Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Allow only registered markers for pytest Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Jason * Fix torchaudio installation (#5850) * Fail if torchaudio not installed Signed-off-by: Vladimir Bataev * Fix torchaudio matching version Signed-off-by: Vladimir Bataev * Warn if Pytorch major version changed Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Jason * Update fastpitch.py (#5969) Signed-off-by: Jason * Review comments Signed-off-by: Boris Fomitchev Signed-off-by: Jason * per-micro-batch input loader (#5635) * per-micro-batch input loader * per-micro-batch input loader set arg default val * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fix * apply per-microbatch-loader to only GPT * update docstring on micro-batch input loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed the default arg val * fix batch size to 1 at log stat registration * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update container for CI Signed-off-by: ericharper * update container in jenkinsfile Signed-off-by: ericharper * update container for CI Signed-off-by: ericharper fix merge conflict * revert Jenkinsfile * Revert "revert Jenkinsfile" This reverts commit d23b7757e0f935dacde2840f234193c632a2b3be. * Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * add GradScaler * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: ericharper Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: ericharper Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Signed-off-by: Jason * update container in readme (#5981) Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: Jason * Support Alignment Extraction for all RNNT Beam decoding methods (#5925) * Partial impl of ALSD alignment extraction Signed-off-by: smajumdar * Partial impl of ALSD alignment extraction Signed-off-by: smajumdar * Remove everything else Signed-off-by: smajumdar * Support dataclass in AbstractRNNTDecoding Signed-off-by: smajumdar * Add first draft unittest Signed-off-by: smajumdar * Correct the logic to more to the next timestep in the alignment Signed-off-by: smajumdar * Finalize ALSD alignment generation Signed-off-by: smajumdar * Add support for TSD greedy alignment extraction Signed-off-by: smajumdar * Add support for mAES greedy alignment extraction Signed-off-by: smajumdar * Finalize extraction of alignments from all beam algorithms for RNNT Signed-off-by: smajumdar * Style fixes Signed-off-by: smajumdar * Add copyright Signed-off-by: smajumdar * Address comments Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Signed-off-by: Jason * Add AWS SageMaker ASR Examples (#5638) * Base code for AWS SageMaker example Signed-off-by: SeanNaren * Remove format Signed-off-by: SeanNaren * wrap 
Signed-off-by: SeanNaren * Add a notebook with the code Signed-off-by: SeanNaren * Setup Signed-off-by: SeanNaren * Update notebook Signed-off-by: SeanNaren * Remove space Signed-off-by: SeanNaren * Fix spelling mistake Signed-off-by: SeanNaren * Add message to explain usage Signed-off-by: SeanNaren * Add CommonVoice esperanto example Signed-off-by: SeanNaren * Fix path Signed-off-by: SeanNaren * Fixes Signed-off-by: SeanNaren * Import sox locally, add documentation Signed-off-by: SeanNaren * Address reviews Signed-off-by: SeanNaren * Address reviews Signed-off-by: SeanNaren * Address reviews Signed-off-by: SeanNaren * Add cell to download the SSL model Signed-off-by: SeanNaren * Set max epochs to 300 Signed-off-by: SeanNaren * Fixes, introduce HF dataset instructions Signed-off-by: SeanNaren * Upstream updates from other branch Signed-off-by: SeanNaren * Fix warning Signed-off-by: SeanNaren * Add README, add image Signed-off-by: SeanNaren * Fix warning Signed-off-by: SeanNaren * Address feedback Signed-off-by: SeanNaren * Feedback Signed-off-by: SeanNaren --------- Signed-off-by: SeanNaren Signed-off-by: Jason * Update PUBLICATIONS.md (#5963) * Add papers from 2022/2022 to PUBLICATIONS.md Signed-off-by: smajumdar * Remove ipynb from being tracked as for nemo code library Signed-off-by: smajumdar * Remove ipynb from being tracked as for nemo code library Signed-off-by: smajumdar * Add additional papers Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Signed-off-by: Jason * [G2P] fixed typos and broken import library. (#5978) (#5979) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason * [G2P] added backward compatibility for english tokenizer and fixed unit tests (#5980) (#5984) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Jason --------- Signed-off-by: Micha Livne Signed-off-by: Jason Signed-off-by: Matvei Novikov Signed-off-by: nithinraok Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: fayejf Signed-off-by: Boris Fomitchev Signed-off-by: stevehuang52 Signed-off-by: Taejin Park Signed-off-by: Yi Dong Signed-off-by: Roman Korostik Signed-off-by: Roman Korostik Signed-off-by: Tim Moon Signed-off-by: Jean-Louis Queguiner Signed-off-by: ekmb Signed-off-by: Vladimir Bataev Signed-off-by: Jocelyn Huang Signed-off-by: SeanNaren Signed-off-by: gabitza-tech Signed-off-by: ericharper Signed-off-by: athitten Signed-off-by: Ante Jukić Signed-off-by: smajumdar Signed-off-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> Signed-off-by: MaximumEntropy Signed-off-by: Micha Livne Signed-off-by: Mohamed Saad Ibn Seddik Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: David Mosallanezhad Signed-off-by: Daniel Egert Signed-off-by: Yang Zhang Signed-off-by: Evgeniy Shabalin <36159472+treacker@users.noreply.github.com> Signed-off-by: Evgeniy Shabalin Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Micha Livne Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Micha Livne Co-authored-by: Matvei Novikov Co-authored-by: Nithin Rao Co-authored-by: fayejf 
<36722593+fayejf@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Co-authored-by: Roman Korostik Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Jean-Louis Queguiner Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: Oleksii Kuchaiev Co-authored-by: Vladimir Bataev Co-authored-by: Mikyas Desta Co-authored-by: Jocelyn Co-authored-by: Sean Naren Co-authored-by: Gabriel Pirlogeanu <53811655+gabitza-tech@users.noreply.github.com> Co-authored-by: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Co-authored-by: ANMOL GUPTA Co-authored-by: Eric Harper Co-authored-by: Zhilin Wang Co-authored-by: athitten <47577437+athitten@users.noreply.github.com> Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: Somshubra Majumdar Co-authored-by: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Co-authored-by: Micha Livne Co-authored-by: Mohamed Saad Ibn Seddik Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: David Co-authored-by: David Mosallanezhad Co-authored-by: trias702 <25867060+trias702@users.noreply.github.com> Co-authored-by: Daniel Egert Co-authored-by: Yang Zhang Co-authored-by: Mikołaj Błaż Co-authored-by: Evgeniy Shabalin <36159472+treacker@users.noreply.github.com> Co-authored-by: Jason Co-authored-by: Sangkug Lym --- examples/tts/conf/rad-tts_dec_ipa.yaml | 275 ++++++++++++++ .../tts/conf/rad-tts_feature_pred_ipa.yaml | 341 ++++++++++++++++++ nemo/collections/tts/helpers/helpers.py | 43 +-- nemo/collections/tts/models/radtts.py | 27 +- .../tts/modules/attribute_prediction_model.py | 11 +- nemo/collections/tts/modules/common.py | 88 ++--- nemo/collections/tts/modules/radtts.py | 65 ++-- nemo/collections/tts/modules/submodules.py | 2 +- nemo/collections/tts/modules/transformer.py | 50 +++ tests/collections/tts/test_helpers.py | 43 +++ tests/collections/tts/test_tts_exportables.py | 11 +- 11 files changed, 816 insertions(+), 140 deletions(-) create mode 100644 examples/tts/conf/rad-tts_dec_ipa.yaml create mode 100644 examples/tts/conf/rad-tts_feature_pred_ipa.yaml create mode 100644 tests/collections/tts/test_helpers.py diff --git a/examples/tts/conf/rad-tts_dec_ipa.yaml b/examples/tts/conf/rad-tts_dec_ipa.yaml new file mode 100644 index 000000000000..b251537daad8 --- /dev/null +++ b/examples/tts/conf/rad-tts_dec_ipa.yaml @@ -0,0 +1,275 @@ +name: RadTTS +sample_rate: 22050 + +train_dataset: ??? +validation_datasets: ??? +ckpt_path: None +export_dir: ??? +sup_data_path: ??? +sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] + + + +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech + +# default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 2093.004522404789 + +# default values for sample_rate=22050 +n_mels: 80 +n_window_size: 1024 +n_window_stride: 256 +n_fft: 1024 +lowfreq: 0 +highfreq: 8000 +window: "hann" + + +phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" +mapping_file_path: "" + +model: + target: nemo.collections.tts.models.RadTTSModel + bin_loss_start_ratio: 0.2 + bin_loss_warmup_epochs: 100 + + symbols_embedding_dim: 384 + n_mel_channels: ${n_mels} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: en + input_case: cased + whitelist: ${whitelist_path} + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo_text_processing.g2p.modules.IPAG2P + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + # Relies on the heteronyms list for anything that needs to be disambiguated + ignore_ambiguous_words: true + use_chars: true + use_stresses: true + + train_ds: + dataset: + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${train_dataset} + sample_rate: ${sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${n_fft} + win_length: ${n_window_size} + hop_length: ${n_window_stride} + window: ${window} + n_mels: ${n_mels} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: False + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + + + text_tokenizer: + _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: True + stresses: True + chars: True + space: ' ' + silence: null + apostrophe: True + sep: '|' + add_blank_at: null + pad_with_space: True + g2p: + _target_: "nemo_text_processing.g2p.modules.EnglishG2p" + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + dataloader_params: + drop_last: false + shuffle: true + batch_size: 8 + num_workers: 8 + pin_memory: false + + validation_ds: + dataset: + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${validation_datasets} + sample_rate: ${sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${n_fft} + win_length: ${n_window_size} + hop_length: ${n_window_stride} + window: ${window} + n_mels: ${n_mels} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: False + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + text_tokenizer: + _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: True + stresses: True + chars: True + space: ' ' + silence: null + apostrophe: True + sep: '|' + add_blank_at: null + pad_with_space: True + g2p: + _target_: "nemo_text_processing.g2p.modules.EnglishG2p" + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + 
dataloader_params: + drop_last: false + shuffle: false + batch_size: 8 + num_workers: 8 + pin_memory: false + + optim: + name: RAdam + lr: 0.0001 + betas: [0.9, 0.98] + weight_decay: 0.000001 + + sched: + name: exp_decay + warmup_steps: 40000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + trainerConfig: + sigma: 1 + iters_per_checkpoint: 3000 + seed: null + ignore_layers: [] + finetune_layers: [] + include_layers: [] + with_tensorboard: true + dur_loss_weight: 1 + ctc_loss_weight: 1 + mask_unvoiced_f0: false + log_step: 1 + binarization_start_iter: 6000 + kl_loss_start_iter: 18000 + loss_weights: + ctc_loss_weight: 0.1 + dur_loss_weight: 1.0 + f0_loss_weight: 1.0 + energy_loss_weight: 1.0 + vpred_loss_weight: 1.0 + unfreeze_modules: "all" + + load_from_checkpoint: False + init_from_ptl_ckpt: ${ckpt_path} + modelConfig: + _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" + n_speakers: 1 + n_speaker_dim: 16 + n_text: 384 #185 + n_text_dim: 512 + n_flows: 8 + n_conv_layers_per_step: 4 + n_mel_channels: 80 + n_hidden: 1024 + mel_encoder_n_hidden: 512 + dummy_speaker_embedding: false + n_early_size: 2 + n_early_every: 2 + n_group_size: 2 + affine_model: wavenet + include_modules: "decatnvpred" + scaling_fn: tanh + matrix_decomposition: LUS + learn_alignments: true + use_context_lstm: true + context_lstm_norm: spectral + context_lstm_w_f0_and_energy: true + text_encoder_lstm_norm: spectral + n_f0_dims: 1 + n_energy_avg_dims: 1 + use_first_order_features: false + unvoiced_bias_activation: "relu" + decoder_use_partial_padding: false + decoder_use_unvoiced_bias: true + ap_pred_log_f0: true + ap_use_unvoiced_bias: true + ap_use_voiced_embeddings: true + dur_model_config: null + f0_model_config: null + energy_model_config: null + v_model_config : + name : dap + hparams : + n_speaker_dim : 16 + take_log_of_input: false + bottleneck_hparams: + in_dim: 512 + reduction_factor: 16 + norm: weightnorm + non_linearity: relu + arch_hparams: + out_dim: 1 + n_layers: 2 + n_channels: 256 + kernel_size: 3 + p_dropout: 0.5 + +trainer: + devices: 8 + precision: 16 + max_epochs: 1000 + num_nodes: 1 + accelerator: gpu + strategy: ddp + accumulate_grad_batches: 1 + enable_checkpointing: False + logger: False + gradient_clip_val: 1 + log_every_n_steps: 100 + check_val_every_n_epoch: 5 + +exp_manager: + exp_dir: ${export_dir} + name: ${name} + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val/loss_ctc + mode: min + filepath: ${export_dir} + filename: model_checkpoint diff --git a/examples/tts/conf/rad-tts_feature_pred_ipa.yaml b/examples/tts/conf/rad-tts_feature_pred_ipa.yaml new file mode 100644 index 000000000000..cf36d7ad9506 --- /dev/null +++ b/examples/tts/conf/rad-tts_feature_pred_ipa.yaml @@ -0,0 +1,341 @@ +name: RadTTS +sample_rate: 22050 + +train_dataset: ??? +validation_datasets: ??? +ckpt_path: ??? +export_dir: ??? +sup_data_path: ??? +sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] + + +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech + +# default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 2093.004522404789 + +# default values for sample_rate=22050 +n_mels: 80 +n_window_size: 1024 +n_window_stride: 256 +n_fft: 1024 +lowfreq: 0 +highfreq: 8000 +window: "hann" + +phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" +whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" +mapping_file_path: "" + +model: + target: nemo.collections.tts.models.RadTTSModel + bin_loss_start_ratio: 0.2 + bin_loss_warmup_epochs: 100 + + symbols_embedding_dim: 384 + n_mel_channels: ${n_mels} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: en + input_case: cased + whitelist: ${whitelist_path} + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo_text_processing.g2p.modules.IPAG2P + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + # Relies on the heteronyms list for anything that needs to be disambiguated + ignore_ambiguous_words: true + use_chars: true + use_stresses: true + + train_ds: + dataset: + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${train_dataset} + sample_rate: ${sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${n_fft} + win_length: ${n_window_size} + hop_length: ${n_window_stride} + window: ${window} + n_mels: ${n_mels} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: False + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + + + text_tokenizer: + _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: True + stresses: True + chars: True + space: ' ' + silence: null + apostrophe: True + sep: '|' + add_blank_at: null + pad_with_space: True + g2p: + _target_: "nemo_text_processing.g2p.modules.EnglishG2p" + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + dataloader_params: + drop_last: false + shuffle: true + batch_size: 32 + num_workers: 8 + pin_memory: True + + validation_ds: + dataset: + _target_: "nemo.collections.tts.torch.data.TTSDataset" + manifest_filepath: ${validation_datasets} + sample_rate: ${sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${n_fft} + win_length: ${n_window_size} + hop_length: ${n_window_stride} + window: ${window} + n_mels: ${n_mels} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: False + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + text_tokenizer: + _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" + punct: True + stresses: True + chars: True + space: ' ' + silence: null + apostrophe: True + sep: '|' + add_blank_at: null + pad_with_space: True + g2p: + _target_: "nemo_text_processing.g2p.modules.EnglishG2p" + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + dataloader_params: 
+ drop_last: false + shuffle: false + batch_size: 32 + num_workers: 8 + pin_memory: True + + optim: + name: RAdam + lr: 0.001 + betas: [0.9, 0.98] + weight_decay: 0.000001 + + sched: + name: exp_decay + warmup_steps: 40000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + trainerConfig: + sigma: 1 + iters_per_checkpoint: 3000 + seed: null + ignore_layers: [] + finetune_layers: [] + include_layers: [] + with_tensorboard: true + dur_loss_weight: 1 + ctc_loss_weight: 1 + mask_unvoiced_f0: false + log_step: 1 + binarization_start_iter: 1000000 + kl_loss_start_iter: 1000000 + loss_weights: + ctc_loss_weight: 0.1 + dur_loss_weight: 1.0 + f0_loss_weight: 1.0 + energy_loss_weight: 1.0 + vpred_loss_weight: 1.0 + unfreeze_modules: "durf0energyvpred" + + load_from_checkpoint: True + init_from_ptl_ckpt: ${ckpt_path} + modelConfig: + _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" + n_speakers: 1 + n_speaker_dim: 16 + n_text: 384 #185 + n_text_dim: 512 + n_flows: 8 + n_conv_layers_per_step: 4 + n_mel_channels: 80 + n_hidden: 1024 + mel_encoder_n_hidden: 512 + n_components: 0 + mean_scale: 0 + fixed_gaussian: true + dummy_speaker_embedding: false + use_positional_embedding: false + n_early_size: 2 + n_early_every: 2 + n_group_size: 2 + use_feature_gating: false + affine_model: wavenet + include_modules: "decatnunvbiasdpmvpredapm" + what_to_train: decatnunvbias + scaling_fn: tanh + reduction_norm: "" + matrix_decomposition: LUS + learn_alignments: true + use_query_proj: true + align_query_enc_type: 3xconv + lstm_applicable_steps: [] + use_context_lstm: true + context_lstm_norm: spectral + context_lstm_w_f0_and_energy: true + text_encoder_lstm_norm: spectral + use_text_conditional_priors: false + zero_out_context: false + n_aug_dims: 6 + n_f0_dims: 1 + n_energy_avg_dims: 1 + use_first_order_features: false + unvoiced_bias_activation: "relu" + decoder_use_partial_padding: false + decoder_use_unvoiced_bias: true + ap_pred_log_f0: true + ap_use_unvoiced_bias: true + ap_use_voiced_embeddings: true + p_dropout: 0.1 + noise_to_unvoiced_in_f0: 0 + noise_to_pvoiced: 0 + dur_model_config: + name: dap + hparams: + n_speaker_dim: 16 + bottleneck_hparams: + in_dim: 512 + reduction_factor: 16 + norm: weightnorm + non_linearity: relu + take_log_of_input: true + use_transformer: true + arch_hparams: + out_dim: 1 + n_layers: 3 + n_head: 1 + d_head: 64 + d_inner: 1024 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0 + in_dim: 48 + f0_model_config: + name: dap + hparams: + n_speaker_dim: 16 + bottleneck_hparams: + in_dim: 512 + reduction_factor: 16 + norm: weightnorm + non_linearity: relu + take_log_of_input: false + arch_hparams: + out_dim: 1 + n_layers: 2 + n_channels: 256 + kernel_size: 11 + p_dropout: 0.5 + + energy_model_config: + name: dap + hparams: + n_speaker_dim: 16 + bottleneck_hparams: + in_dim: 512 + reduction_factor: 16 + norm: weightnorm + non_linearity: relu + take_log_of_input: false + arch_hparams: + out_dim: 1 + n_layers: 2 + n_channels: 256 + kernel_size: 3 + p_dropout: 0.5 + v_model_config : + name: dap + hparams: + n_speaker_dim: 16 + take_log_of_input: false + bottleneck_hparams: + in_dim: 512 + reduction_factor: 16 + norm: weightnorm + non_linearity: relu + arch_hparams: + out_dim: 1 + n_layers: 2 + n_channels: 256 + kernel_size: 3 + p_dropout: 0.5 + +trainer: + devices: 8 + precision: 16 + max_epochs: 1000 + num_nodes: 1 + accelerator: gpu + strategy: ddp + accumulate_grad_batches: 1 + enable_checkpointing: False + logger: False + gradient_clip_val: 1 + 
log_every_n_steps: 100 + check_val_every_n_epoch: 2 + +exp_manager: + exp_dir: ${export_dir} + name: ${name} + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val/loss_energy + mode: min + filepath: ${export_dir} + filename: model_checkpoint diff --git a/nemo/collections/tts/helpers/helpers.py b/nemo/collections/tts/helpers/helpers.py index 535a33cf92e9..e7e92973910f 100644 --- a/nemo/collections/tts/helpers/helpers.py +++ b/nemo/collections/tts/helpers/helpers.py @@ -155,14 +155,17 @@ def get_mask_from_lengths(lengths: Optional[torch.Tensor] = None, x: Optional[to return mask -@torch.jit.script -def sort_tensor(context: torch.Tensor, lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - lens_sorted, ids_sorted = torch.sort(lens, descending=True) - unsort_ids = torch.zeros_like(ids_sorted) - for i in range(ids_sorted.shape[0]): - unsort_ids[ids_sorted[i]] = i - context = context[ids_sorted] - return context, lens_sorted, unsort_ids +def sort_tensor( + context: torch.Tensor, lens: torch.Tensor, dim: Optional[int] = 0, descending: Optional[bool] = True +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + lens_sorted, ids_sorted = torch.sort(lens, descending=descending) + context = torch.index_select(context, dim, ids_sorted) + return context, lens_sorted, ids_sorted + + +def unsort_tensor(ordered: torch.Tensor, indices: torch.Tensor, dim: Optional[int] = 0) -> torch.Tensor: + unsort_ids = indices.argsort(0) + return torch.index_select(ordered, dim, unsort_ids) @jit(nopython=True) @@ -520,13 +523,7 @@ def remove(conv_list): def regulate_len( - durations, - enc_out, - pace=1.0, - mel_max_len=None, - replicate_to_nearest_multiple=False, - group_size=2, - in_lens: torch.tensor = None, + durations, enc_out, pace=1.0, mel_max_len=None, group_size=1, dur_lens: torch.tensor = None, ): """A function that takes predicted durations per encoded token, and repeats enc_out according to the duration. NOTE: durations.shape[1] == enc_out.shape[1] @@ -537,22 +534,20 @@ def regulate_len( enc_out (torch.tensor): A tensor of shape (batch x enc_length x enc_hidden) that represents the encoded tokens. pace (float): The pace of speaker. Higher values result in faster speaking pace. Defaults to 1.0. max_mel_len (int): The maximum length above which the output will be removed. If sum(durations, dim=1) > max_mel_len, the values after max_mel_len will be removed. Defaults to None, which has no max length.
- replicate_to_nearest_multiple (bool): replicate the last element specified by durations[i, in_lens[i] - 1] until the + group_size (int): replicate the last element specified by durations[i, dur_lens[i] - 1] until the full length of the sequence is the next nearest multiple of group_size - group_size (int): factor used by replicate_to_nearest_multiple - in_lens (torch.tensor): input sequence length specifying valid values in the durations input tensor + dur_lens (torch.tensor): input sequence lengths specifying valid values in the durations input tensor (only needed if group_size > 1) """ dtype = enc_out.dtype reps = durations.float() / pace reps = (reps + 0.5).floor().long() dec_lens = reps.sum(dim=1) - - if replicate_to_nearest_multiple: - to_pad = group_size * (torch.div(dec_lens, group_size, rounding_mode='floor') + 1) - dec_lens - to_pad = to_pad.unsqueeze(-1).repeat(1, reps.shape[1]) - to_pad_expanded = torch.zeros_like(reps).scatter_(1, in_lens.unsqueeze(-1).long() - 1, to_pad) - reps = reps + to_pad_expanded + if group_size > 1: + to_pad = group_size * (torch.div(dec_lens + 1, group_size, rounding_mode='floor')) - dec_lens + reps.index_put_( + indices=[torch.arange(dur_lens.shape[0], dtype=torch.long), dur_lens - 1], values=to_pad, accumulate=True + ) dec_lens = reps.sum(dim=1) max_len = dec_lens.max() diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index e303c924f4b2..1612af913f09 100644 --- a/nemo/collections/tts/models/radtts.py +++ b/nemo/collections/tts/models/radtts.py @@ -424,7 +424,6 @@ def _prepare_for_export(self, **kwargs): self._input_types = { "text": NeuralType(('B', 'T'), TokenIndex()), "lens": NeuralType(('B')), - # "batch_lengths": NeuralType(('B'), LengthsType(), optional=True), "speaker_id": NeuralType(('B'), Index()), "speaker_id_text": NeuralType(('B'), Index()), "speaker_id_attributes": NeuralType(('B'), Index()), @@ -443,12 +442,13 @@ def input_example(self, max_batch=1, max_dim=400): par = next(self.parameters()) sz = (max_batch, max_dim) # sz = (max_batch * max_dim,) - inp = torch.randint(0, 94, sz, device=par.device, dtype=torch.int64) + # Pick up only pronounceable tokens + inp = torch.randint(32, 64, sz, device=par.device, dtype=torch.int64) speaker = torch.randint(0, 1, (max_batch,), device=par.device, dtype=torch.int64) pitch = torch.randn(sz, device=par.device, dtype=torch.float32) * 0.5 - pace = torch.clamp(torch.randn(sz, device=par.device, dtype=torch.float32) * 0.1 + 1, min=0.01) - volume = torch.clamp(torch.randn(sz, device=par.device, dtype=torch.float32) * 0.1 + 1, min=0.01) - # batch_lengths = torch.zeros((max_batch + 1), device=par.device, dtype=torch.int32) + pace = torch.clamp(torch.randn(sz, device=par.device, dtype=torch.float32) * 0.1 + 1, min=0.2, max=2.0) + volume = torch.clamp(torch.randn(sz, device=par.device, dtype=torch.float32) * 0.1 + 1, min=0.2, max=2.0) + # batch_lengths = torch.zeros((max_batch + 1), device=par.device, dtype=torch.int64) # left_over_size = sz[0] # batch_lengths[0] = 0 # for i in range(1, max_batch): @@ -468,10 +468,11 @@ def input_example(self, max_batch=1, max_dim=400): lens = [] for i, _ in enumerate(inp): - len_i = random.randint(3, max_dim) + len_i = random.randint(64, max_dim) lens.append(len_i) - inp[i, len_i:] = pad_id - lens = torch.tensor(lens, device=par.device, dtype=torch.int) + # inp[i, len_i:] = pad_id + lens = torch.tensor(lens, device=par.device, dtype=torch.int32) + lens[0] = max_dim inputs = { 'text': inp, @@ -489,13 +490,14 @@ def input_example(self,
max_batch=1, max_dim=400): def forward_for_export( self, text, lens, speaker_id, speaker_id_text, speaker_id_attributes, pitch, pace, volume, ): + lens = lens.to(dtype=torch.int64) (mel, n_frames, dur, _, _) = self.model.infer( speaker_id, text, speaker_id_text=speaker_id_text, speaker_id_attributes=speaker_id_attributes, - sigma=0.0, - sigma_txt=0.0, + sigma=0.7, + sigma_txt=0.7, sigma_f0=1.0, sigma_energy=1.0, f0_mean=0.0, @@ -511,9 +513,8 @@ def forward_for_export( durs_predicted, volume[:, :truncated_length].unsqueeze(-1), pace[:, :truncated_length], - replicate_to_nearest_multiple=True, group_size=self.model.n_group_size, - in_lens=lens, + dur_lens=lens, ) - volume_extended = volume_extended.squeeze(-1).float() + volume_extended = volume_extended.squeeze(2).float() return mel.float(), n_frames, dur.float(), volume_extended diff --git a/nemo/collections/tts/modules/attribute_prediction_model.py b/nemo/collections/tts/modules/attribute_prediction_model.py index d6595c94efea..ddc45045e831 100644 --- a/nemo/collections/tts/modules/attribute_prediction_model.py +++ b/nemo/collections/tts/modules/attribute_prediction_model.py @@ -19,6 +19,7 @@ from nemo.collections.tts.helpers.helpers import get_mask_from_lengths from nemo.collections.tts.modules.common import ConvLSTMLinear from nemo.collections.tts.modules.submodules import ConvNorm, MaskedInstanceNorm1d +from nemo.collections.tts.modules.transformer import FFTransformer def get_attribute_prediction_model(config): @@ -68,7 +69,8 @@ def __init__(self, in_dim, reduction_factor, norm='weightnorm', non_linearity='r def forward(self, x, lens): if self.reduction_factor > 1: + # borisf: here, float() instead of to(x.dtype) to work around an ONNX exporter bug - mask = get_mask_from_lengths(lens, x).unsqueeze(1) + mask = get_mask_from_lengths(lens, x).unsqueeze(1).float() x = self.projection_fn(x, mask) if self.non_linearity == 'relu': x = F.relu(x) @@ -78,11 +80,14 @@ def forward(self, x, lens): class DAP(AttributeProcessing): - def __init__(self, n_speaker_dim, bottleneck_hparams, take_log_of_input, arch_hparams): + def __init__(self, n_speaker_dim, bottleneck_hparams, take_log_of_input, arch_hparams, use_transformer=False): super(DAP, self).__init__(take_log_of_input) self.bottleneck_layer = BottleneckLayerLayer(**bottleneck_hparams) arch_hparams['in_dim'] = self.bottleneck_layer.out_dim + n_speaker_dim - self.feat_pred_fn = ConvLSTMLinear(**arch_hparams) + if use_transformer: + self.feat_pred_fn = FFTransformer(**arch_hparams) + else: + self.feat_pred_fn = ConvLSTMLinear(**arch_hparams) def forward(self, txt_enc, spk_emb, x, lens): if x is not None: diff --git a/nemo/collections/tts/modules/common.py b/nemo/collections/tts/modules/common.py index 0c3dbed24e84..2b793c15116a 100644 --- a/nemo/collections/tts/modules/common.py +++ b/nemo/collections/tts/modules/common.py @@ -24,7 +24,7 @@ from torch.nn import functional as F from torch.nn.utils.rnn import PackedSequence -from nemo.collections.tts.helpers.helpers import get_mask_from_lengths, sort_tensor +from nemo.collections.tts.helpers.helpers import get_mask_from_lengths, sort_tensor, unsort_tensor from nemo.collections.tts.helpers.splines import ( piecewise_linear_inverse_transform, piecewise_linear_transform, @@ -82,35 +82,37 @@ def __init__(self, input_size, hidden_size, num_layers=1, lstm_norm_fn="spectral self.bilstm.flatten_parameters() - def lstm_tensor(self, context: Tensor, lens: Tensor, enforce_sorted: bool = False) -> Tuple[Tensor, Tensor]: - seq = nn.utils.rnn.pack_padded_sequence( -
context, lens.long().cpu(), batch_first=True, enforce_sorted=enforce_sorted - ) - return self.lstm_sequence(seq) + def lstm_sorted(self, context: Tensor, lens: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tensor: + seq = nn.utils.rnn.pack_padded_sequence(context, lens.long().cpu(), batch_first=True, enforce_sorted=True) + ret, _ = self.bilstm(seq, hx) + return nn.utils.rnn.pad_packed_sequence(ret, batch_first=True)[0] - def lstm_sequence(self, seq: PackedSequence) -> Tuple[Tensor, Tensor]: - if not (torch.jit.is_scripting() or torch.jit.is_tracing()): - self.bilstm.flatten_parameters() - ret, _ = self.bilstm(seq) - else: + def lstm(self, context: Tensor, lens: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tensor: + # To be ONNX-exportable, we need to sort here rather than while packing + context, lens, unsort_ids = sort_tensor(context, lens) + ret = self.lstm_sorted(context, lens, hx=hx) + return unsort_tensor(ret, unsort_ids) + + def lstm_nocast(self, context: Tensor, lens: Tensor) -> Tensor: + dtype = context.dtype + # autocast guard is only needed for Torchscript to run in Triton + # (https://github.com/pytorch/pytorch/issues/89241) + with torch.cuda.amp.autocast(enabled=False): # Calculate sizes and prepare views to our zero buffer to pass as hx - max_batch_size = seq.batch_sizes[0] + max_batch_size = context.shape[0] + context = context.to(dtype=torch.float32) common_shape = (self.bilstm.num_layers * 2, max_batch_size) hx = ( - seq.data.new_zeros(*common_shape, self.real_hidden_size), - seq.data.new_zeros(*common_shape, self.bilstm.hidden_size), + context.new_zeros(*common_shape, self.real_hidden_size), + context.new_zeros(*common_shape, self.bilstm.hidden_size), ) - ret, _ = self.bilstm(seq, hx) - return nn.utils.rnn.pad_packed_sequence(ret, batch_first=True) + return self.lstm(context, lens, hx=hx).to(dtype=dtype) def forward(self, context: Tensor, lens: Tensor) -> Tensor: - context, lens, unsort_ids = sort_tensor(context, lens) - dtype = context.dtype - # this is only needed for Torchscript to run in Triton - # (https://github.com/pytorch/pytorch/issues/89241) - with torch.cuda.amp.autocast(enabled=False): - ret = self.lstm_tensor(context.to(dtype=torch.float32), lens, enforce_sorted=True) - return ret[0].to(dtype=dtype)[unsort_ids] + self.bilstm.flatten_parameters() + if torch.jit.is_tracing(): + return self.lstm_nocast(context, lens) + return self.lstm(context, lens) class ConvLSTMLinear(nn.Module): @@ -162,9 +164,8 @@ def forward(self, context: Tensor, lens: Tensor) -> Tensor: mask = mask.to(dtype=context.dtype).unsqueeze(1) for conv in self.convolutions: context = self.dropout(F.relu(conv(context, mask))) - context = context.transpose(1, 2) # Apply Bidirectional LSTM - context = self.bilstm(context, lens) + context = self.bilstm(context.transpose(1, 2), lens=lens) if self.dense is not None: context = self.dense(context).permute(0, 2, 1) return context @@ -211,12 +212,9 @@ def forward(self, z, inverse=False): if inverse: if not hasattr(self, 'W_inverse'): # inverse computation - W_inverse = W.float().inverse() - if z.type() == 'torch.cuda.HalfTensor': - W_inverse = W_inverse.half() - + W_inverse = W.float().inverse().to(dtype=z.dtype) self.W_inverse = W_inverse[..., None] - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + z = F.conv1d(z, self.W_inverse.to(dtype=z.dtype), bias=None, stride=1, padding=0) return z else: W = W[..., None] @@ -252,10 +250,7 @@ def forward(self, z, inverse=False): if inverse: if not hasattr(self,
'W_inverse'): # Inverse computation - W_inverse = W.float().inverse() - if z.type() == 'torch.cuda.HalfTensor': - W_inverse = W_inverse.half() - + W_inverse = W.float().inverse().to(dtype=z.dtype) self.W_inverse = W_inverse[..., None] z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) return z @@ -717,31 +712,6 @@ def __init__(self, n_mel_channels=80, n_speaker_dim=128, n_text_channels=512, n_ ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True), ) - def run_padded_sequence(self, sorted_idx, unsort_idx, lens, padded_data, recurrent_model): - """Sorts input data by previded ordering (and un-ordering) and runs the - packed data through the recurrent model - - Args: - sorted_idx (torch.tensor): 1D sorting index - unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx) - lens: lengths of input data (sorted in descending order) - padded_data (torch.tensor): input sequences (padded) - recurrent_model (nn.Module): recurrent model to run data through - Returns: - hidden_vectors (torch.tensor): outputs of the RNN, in the original, - unsorted, ordering - """ - - # sort the data by decreasing length using provided index - # we assume batch index is in dim=1 - padded_data = padded_data[:, sorted_idx] - padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens) - hidden_vectors = recurrent_model(padded_data)[0] - hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors) - # unsort the results at dim=1 and return - hidden_vectors = hidden_vectors[:, unsort_idx] - return hidden_vectors - def forward(self, queries, keys, query_lens, mask=None, key_lens=None, attn_prior=None): """Attention mechanism for radtts. Unlike in Flowtron, we have no restrictions such as causality etc, since we only need this during diff --git a/nemo/collections/tts/modules/radtts.py b/nemo/collections/tts/modules/radtts.py index 8638d84ce84d..4bf9532f463a 100644 --- a/nemo/collections/tts/modules/radtts.py +++ b/nemo/collections/tts/modules/radtts.py @@ -443,6 +443,7 @@ def forward( attn = None attn_soft = None attn_hard = None + attn_logprob = None if 'atn' in self.include_modules or 'dec' in self.include_modules: # make sure to do the alignments before folding attn_mask = ~get_mask_from_lengths(in_lens)[..., None] @@ -581,13 +582,13 @@ def infer( self, speaker_id, text, - sigma, - sigma_txt=0.8, - sigma_f0=0.8, - sigma_energy=0.8, + sigma=0.7, + sigma_txt=0.7, + sigma_f0=1.0, + sigma_energy=1.0, speaker_id_text=None, speaker_id_attributes=None, - pace=1.0, + pace=None, token_duration_max=100, in_lens=None, dur=None, @@ -600,10 +601,16 @@ def infer( ): batch_size = text.shape[0] - n_tokens = text.shape[1] if in_lens is None: - in_lens = text.new_ones((batch_size,), dtype=torch.int) * n_tokens + in_lens = text.new_ones((batch_size,), dtype=torch.int64) * text.shape[1] + txt_len_pad_removed = text.shape[1] + else: + txt_len_pad_removed = torch.max(in_lens) + # borisf : this should not be needed as long as we have properly formed input batch + text = text[:, :txt_len_pad_removed] + spk_vec = self.encode_speaker(speaker_id) + if speaker_id_text is None: speaker_id_text = speaker_id if speaker_id_attributes is None: @@ -617,20 +624,15 @@ def infer( dur = self.dur_pred_layer.infer(txt_enc, spk_vec_text, lens=in_lens) dur = pad_dur(dur, txt_enc) dur = dur[:, 0] - dur = dur.clamp(0, token_duration_max) - # text encoded removes padding tokens so shape of text_enc is changed - # need to adjust pace, pitch_shift to account for this - txt_len_pad_removed = torch.max(in_lens) - pace 
= pace[:, :txt_len_pad_removed] - pitch_shift = pitch_shift[:, :txt_len_pad_removed].unsqueeze(-1) + dur = dur.clamp(1, token_duration_max) + + if pace is None: + pace = txt_enc.new_ones((batch_size, txt_len_pad_removed)) + else: + pace = pace[:, :txt_len_pad_removed] txt_enc_time_expanded, out_lens = regulate_len( - dur, - txt_enc.transpose(1, 2), - pace, - replicate_to_nearest_multiple=True, - group_size=self.n_group_size, - in_lens=in_lens, + dur, txt_enc.transpose(1, 2), pace, group_size=self.n_group_size, dur_lens=in_lens, ) n_groups = torch.div(out_lens, self.n_group_size, rounding_mode='floor') max_out_len = torch.max(out_lens) @@ -671,25 +673,18 @@ def infer( # FIXME: use replication pad (energy_avg, f0) = pad_energy_avg_and_f0(energy_avg, f0, max_out_len) - pitch_shift_spec_len = 0 if pitch_shift is not None: pitch_shift_spec_len, _ = regulate_len( dur, - pitch_shift, + pitch_shift[:, :txt_len_pad_removed].unsqueeze(-1), pace, - replicate_to_nearest_multiple=True, group_size=self.n_group_size, - in_lens=in_lens, + dur_lens=in_lens, ) - pitch_shift_spec_len = pitch_shift_spec_len.squeeze(-1) + f0_bias = pitch_shift_spec_len.squeeze(-1) + f0_bias context_w_spkvec = self.preprocess_context( - txt_enc_time_expanded, - spk_vec, - out_lens, - (f0 + f0_bias + pitch_shift_spec_len) * voiced_mask, - energy_avg, - assume_padded=True, + txt_enc_time_expanded, spk_vec, out_lens, (f0 + f0_bias) * voiced_mask, energy_avg, assume_padded=True, ) residual = txt_enc.new_zeros(batch_size, 80 * self.n_group_size, torch.max(n_groups)) @@ -698,7 +693,9 @@ def infer( # map from z sample to data num_steps_to_exit = len(self.exit_steps) - remaining_residual, mel = torch.tensor_split(residual, [num_steps_to_exit * self.n_early_size,], dim=1) + split = num_steps_to_exit * self.n_early_size + mel = residual[:, split:] + residual = residual[:, :split] for i, flow_step in enumerate(reversed(self.flows)): curr_step = self.n_flows - i - 1 @@ -706,9 +703,9 @@ def infer( if num_steps_to_exit > 0 and curr_step == self.exit_steps[num_steps_to_exit - 1]: # concatenate the next chunk of z num_steps_to_exit = num_steps_to_exit - 1 - remaining_residual, residual_to_add = torch.tensor_split( - remaining_residual, [num_steps_to_exit * self.n_early_size,], dim=1 - ) + split = num_steps_to_exit * self.n_early_size + residual_to_add = residual[:, split:] + residual = residual[:, :split] mel = torch.cat((residual_to_add, mel), 1) if self.n_group_size > 1: diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index e61b9b224885..275468d60634 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -172,7 +172,7 @@ def forward(self, signal, mask=None): ret = self.norm(ret, mask) else: if mask is not None: - signal = signal * mask + signal = signal.mul(mask) ret = self.conv(signal) if self.norm is not None: ret = self.norm(ret) diff --git a/nemo/collections/tts/modules/transformer.py b/nemo/collections/tts/modules/transformer.py index 25b3767e2df1..8073a24484fe 100644 --- a/nemo/collections/tts/modules/transformer.py +++ b/nemo/collections/tts/modules/transformer.py @@ -17,6 +17,8 @@ import torch.nn as nn import torch.nn.functional as F +from nemo.collections.tts.helpers.helpers import get_mask_from_lengths +from nemo.collections.tts.modules.submodules import LinearNorm from nemo.core.classes import NeuralModule, typecheck from nemo.core.neural_types.elements import EncodedRepresentation, LengthsType, MaskType, TokenIndex from 
nemo.core.neural_types.neural_type import NeuralType @@ -260,3 +262,51 @@ def input_types(self): def forward(self, input, conditioning=0): return self._forward(self.word_emb(input), (input != self.padding_idx).unsqueeze(2), conditioning) # (B, L, 1) + + +class FFTransformer(nn.Module): + def __init__( + self, + in_dim, + out_dim=1, + n_layers=6, + n_head=1, + d_head=64, + d_inner=1024, + kernel_size=3, + dropout=0.1, + dropatt=0.1, + dropemb=0.0, + ): + super(FFTransformer, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.n_head = n_head + self.d_head = d_head + + self.pos_emb = PositionalEmbedding(self.in_dim) + self.drop = nn.Dropout(dropemb) + self.layers = nn.ModuleList() + + for _ in range(n_layers): + self.layers.append( + TransformerLayer(n_head, in_dim, d_head, d_inner, kernel_size, dropout, dropatt=dropatt) + ) + + self.dense = LinearNorm(in_dim, out_dim) + + def forward(self, dec_inp, in_lens): + # B, C, T --> B, T, C + inp = dec_inp.transpose(1, 2) + mask = get_mask_from_lengths(in_lens)[..., None] + + pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) + pos_emb = self.pos_emb(pos_seq) * mask + + out = self.drop(inp + pos_emb) + + for layer in self.layers: + out = layer(out, mask=mask) + + out = self.dense(out).transpose(1, 2) + return out diff --git a/tests/collections/tts/test_helpers.py b/tests/collections/tts/test_helpers.py new file mode 100644 index 000000000000..68504f5e5d49 --- /dev/null +++ b/tests/collections/tts/test_helpers.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch + +from nemo.collections.tts.helpers.helpers import regulate_len + + +def sample_duration_input(max_length=64, group_size=2): + generator = torch.Generator() + generator.manual_seed(0) + batch_size = 3 + lengths = torch.randint(max_length // 4, max_length - 7, (batch_size,), generator=generator) + durs = torch.ones(batch_size, max_length) * group_size + durs[0, lengths[0]] += 1 + durs[2, lengths[2]] -= 1 + enc = torch.randint(16, 64, (batch_size, max_length, 17)) + return durs, enc, lengths + + +@pytest.mark.unit +def test_regulate_len(): + group_size = 2 + durs_in, enc_in, dur_lens = sample_duration_input(group_size=group_size) + enc_out, lens_out = regulate_len(durs_in, enc_in, group_size=group_size, dur_lens=dur_lens) + # make sure lens_out are rounded + sum_diff = lens_out - torch.mul(lens_out // group_size, group_size) + assert sum_diff.sum(dim=0) == 0 + # make sure all round-ups are <= group_size + diff = lens_out - durs_in.sum(dim=1) + assert torch.max(diff) < group_size diff --git a/tests/collections/tts/test_tts_exportables.py b/tests/collections/tts/test_tts_exportables.py index 023f542551ca..f23ef874966f 100644 --- a/tests/collections/tts/test_tts_exportables.py +++ b/tests/collections/tts/test_tts_exportables.py @@ -81,20 +81,19 @@ def test_RadTTSModel_export_to_torchscript(self, radtts_model): with tempfile.TemporaryDirectory() as tmpdir: filename = os.path.join(tmpdir, 'rad.ts') with torch.cuda.amp.autocast(enabled=True, cache_enabled=False, dtype=torch.float16): - input_example1 = model.input_module.input_example(max_batch=3, max_dim=777) - input_example2 = model.input_module.input_example(max_batch=16, max_dim=1024) + input_example1 = model.input_module.input_example(max_batch=13, max_dim=777) + input_example2 = model.input_module.input_example(max_batch=19, max_dim=999) model.export(output=filename, verbose=True, input_example=input_example1, check_trace=[input_example2]) - @pytest.mark.pleasefixme('ONNX not working yet. Restore when Pytorch fixes LSTM/ONNX bugs.') @pytest.mark.run_only_on('GPU') @pytest.mark.unit def test_RadTTSModel_export_to_onnx(self, radtts_model): model = radtts_model.cuda() with tempfile.TemporaryDirectory() as tmpdir: filename = os.path.join(tmpdir, 'rad.onnx') - with torch.cuda.amp.autocast(enabled=False): - input_example1 = model.input_module.input_example(max_batch=3, max_dim=776) - input_example2 = model.input_module.input_example(max_batch=16, max_dim=998) + with torch.cuda.amp.autocast(enabled=True, cache_enabled=False, dtype=torch.float16): + input_example1 = model.input_module.input_example(max_batch=13, max_dim=777) + input_example2 = model.input_module.input_example(max_batch=19, max_dim=999) model.export( output=filename, input_example=input_example1, verbose=True, check_trace=[input_example2], )
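
For reference, the snippet below is a minimal usage sketch (an editor's illustration, not part of the patch) of the sort_tensor / unsort_tensor helpers that the ONNX-exportable BiLSTM path above relies on. It assumes a NeMo build that contains this change; the batch shape and lengths are invented for the example.

    import torch

    from nemo.collections.tts.helpers.helpers import sort_tensor, unsort_tensor

    # Hypothetical padded batch: 4 sequences, up to 12 frames, 8 channels each
    context = torch.randn(4, 12, 8)
    lens = torch.tensor([5, 12, 7, 3])

    # Sort by length (descending), as required by pack_padded_sequence(..., enforce_sorted=True)
    sorted_context, sorted_lens, ids = sort_tensor(context, lens)
    assert torch.equal(sorted_lens, torch.tensor([12, 7, 5, 3]))

    # ... the packed BiLSTM would consume sorted_context here ...

    # Restore the original batch order for the caller
    restored = unsort_tensor(sorted_context, ids)
    assert torch.equal(restored, context)

Sorting outside of pack_padded_sequence keeps the data-dependent reordering out of the packing call itself, which is the property the added lstm()/lstm_sorted() split depends on for tracing and ONNX export.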