Self-supervised tutorial & update #3344

Merged
merged 50 commits into from
Jan 26, 2022
Changes from 48 commits
Commits (50)
60ab6d4
update
sam1373 Dec 15, 2021
93c0e28
update
sam1373 Dec 15, 2021
fe3926f
version test
sam1373 Dec 15, 2021
ddab8dd
version test
sam1373 Dec 15, 2021
3f66999
image for tutorial
sam1373 Dec 15, 2021
9355f94
enc_final in model_defaults
sam1373 Dec 15, 2021
0698b4e
enc_final in model_defaults
sam1373 Dec 15, 2021
7bc8f88
self-supervised tutorial
sam1373 Dec 15, 2021
690b8ec
Merge branch 'main' into pre_training_5
sam1373 Dec 15, 2021
3de5fbe
fix
sam1373 Dec 16, 2021
93c7b11
Merge remote-tracking branch 'origin/pre_training_5' into pre_training_5
sam1373 Dec 16, 2021
1b81e44
fix
sam1373 Dec 16, 2021
b8e5bf0
contextnet ssl config
sam1373 Dec 20, 2021
d5ded21
remove test_ds from config
sam1373 Dec 20, 2021
b42c747
Merge branch 'main' into pre_training_5
sam1373 Dec 20, 2021
62f4bc9
Merge branch 'main' into pre_training_5
sam1373 Dec 20, 2021
ca0ca7f
update recon decoder
sam1373 Dec 21, 2021
c899811
Merge remote-tracking branch 'origin/pre_training_5' into pre_training_5
sam1373 Dec 21, 2021
9632e0d
don't save -last if val_loss is nan
sam1373 Dec 21, 2021
ecfbd66
check if val_loss is there
sam1373 Dec 21, 2021
eda9e40
keep entries from same file together when tarring
sam1373 Dec 22, 2021
aeab8ef
keep entries from same file together when tarring
sam1373 Dec 22, 2021
99e4deb
print num of files in shard
sam1373 Dec 22, 2021
c894aef
update
sam1373 Dec 30, 2021
4310e51
style
sam1373 Dec 30, 2021
268a89a
Merge branch 'main' into pre_training_5
sam1373 Jan 3, 2022
e7cae50
Merge branch 'main' into pre_training_5
sam1373 Jan 12, 2022
567399a
Merge branch 'main' into pre_training_5
sam1373 Jan 12, 2022
916a191
moving configs, add docstrings
sam1373 Jan 19, 2022
4583ac4
Merge branch 'main' into pre_training_5
sam1373 Jan 19, 2022
b649643
Merge remote-tracking branch 'origin/pre_training_5' into pre_training_5
sam1373 Jan 19, 2022
b0585d4
tutorial updates
sam1373 Jan 19, 2022
ba2bb63
update test
sam1373 Jan 19, 2022
e43bb36
update loading
sam1373 Jan 19, 2022
1ff9a88
update loading
sam1373 Jan 19, 2022
b5f2c6d
update loading
sam1373 Jan 19, 2022
7702b34
update loading
sam1373 Jan 19, 2022
14be631
fix
sam1373 Jan 20, 2022
2cd0d7f
fix
sam1373 Jan 20, 2022
6106d6c
citrinet configs
sam1373 Jan 21, 2022
74c9352
Merge branch 'main' into pre_training_5
sam1373 Jan 21, 2022
a00c399
citrinet configs update
sam1373 Jan 21, 2022
da1b142
Merge remote-tracking branch 'origin/pre_training_5' into pre_training_5
sam1373 Jan 21, 2022
6d38032
update
sam1373 Jan 24, 2022
b3564c1
default include all
sam1373 Jan 24, 2022
223e048
comments
sam1373 Jan 24, 2022
3767be4
docstring hydra example
sam1373 Jan 24, 2022
78869b9
Merge branch 'main' into pre_training_5
sam1373 Jan 24, 2022
40823c1
Merge branch 'main' into pre_training_5
sam1373 Jan 26, 2022
e18caf6
Merge branch 'main' into pre_training_5
sam1373 Jan 26, 2022
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -295,7 +295,7 @@ pipeline {
stage('L2: Speech Pre-training - CitriNet') {
steps {
sh 'python examples/asr/speech_pretraining/speech_pre_training.py \
--config-path="../conf/citrinet_ssl/" --config-name="citrinet_ssl_ci" \
--config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
trainer.gpus=[1] \
5 changes: 3 additions & 2 deletions examples/asr/conf/citrinet/citrinet_1024.yaml
@@ -55,6 +55,7 @@ model:
se_context_size: -1
kernel_size_factor: 0.25
filters: 1024
enc_final: 1024

tokenizer:
dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
@@ -400,7 +401,7 @@ model:



- filters: &enc_final 1024
- filters: ${model.model_defaults.enc_final}
repeat: 1
kernel: [41]
stride: [1]
@@ -415,7 +416,7 @@

decoder:
_target_: nemo.collections.asr.modules.ConvASRDecoder
feat_in: *enc_final
feat_in: ${model.model_defaults.enc_final}
num_classes: -1 # filled with vocabulary size from tokenizer at runtime
vocabulary: [] # filled with vocabulary from tokenizer at runtime

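The change above replaces the &enc_final / *enc_final YAML anchor with an OmegaConf interpolation, so the final encoder width lives in model.model_defaults.enc_final and propagates to both the last encoder block and the decoder feat_in, even when it is overridden after the file is parsed (for example from the command line). Below is a minimal Python sketch of that behaviour, assuming only the omegaconf package; the snippet is illustrative and not part of the diff.

from omegaconf import OmegaConf

# Toy config mirroring the pattern used in the Citrinet files in this PR.
cfg = OmegaConf.create(
    """
model:
  model_defaults:
    enc_final: 1024
  encoder:
    jasper:
      - filters: ${model.model_defaults.enc_final}
  decoder:
    feat_in: ${model.model_defaults.enc_final}
"""
)

# A single override now reaches every field that references the default;
# with a plain YAML anchor this would require editing the file itself.
cfg.model.model_defaults.enc_final = 640
print(cfg.model.encoder.jasper[0].filters)  # 640
print(cfg.model.decoder.feat_in)            # 640
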
5 changes: 3 additions & 2 deletions examples/asr/conf/citrinet/citrinet_384.yaml
@@ -54,6 +54,7 @@ model:
se: true
se_context_size: -1
kernel_size_factor: 1.0
enc_final: 640

tokenizer:
dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
@@ -356,7 +357,7 @@ model:
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}

- filters: &enc_final 640
- filters: ${model.model_defaults.enc_final}
repeat: 1
kernel: [41]
stride: [1]
@@ -371,7 +372,7 @@

decoder:
_target_: nemo.collections.asr.modules.ConvASRDecoder
feat_in: *enc_final
feat_in: ${model.model_defaults.enc_final}
num_classes: -1 # filled with vocabulary size from tokenizer at runtime
vocabulary: [] # filled with vocabulary from tokenizer at runtime

5 changes: 3 additions & 2 deletions examples/asr/conf/citrinet/citrinet_512.yaml
@@ -53,6 +53,7 @@ model:
se: true
se_context_size: -1
kernel_size_factor: 1.0
enc_final: 640

tokenizer:
dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
@@ -355,7 +356,7 @@ model:
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}

- filters: &enc_final 640
- filters: ${model.model_defaults.enc_final}
repeat: 1
kernel: [41]
stride: [1]
@@ -370,7 +371,7 @@

decoder:
_target_: nemo.collections.asr.modules.ConvASRDecoder
feat_in: *enc_final
feat_in: ${model.model_defaults.enc_final}
num_classes: -1 # filled with vocabulary size from tokenizer at runtime
vocabulary: [] # filled with vocabulary from tokenizer at runtime

Original file line number Diff line number Diff line change
@@ -24,10 +24,10 @@ model:
max_duration: 35.0
min_duration: 3.0
shuffle: true
# tarred datasets
is_tarred: false
tarred_audio_filepaths: null
shuffle_n: 2048
use_start_end_token: false
# bucketing params
bucketing_strategy: "synced_randomized"
bucketing_batch_size: null
@@ -50,6 +50,7 @@ model:
kernel_size_factor: 0.25
filters: 1024
decoder_out_channels: 128
enc_final: 1024


preprocessor:
@@ -389,7 +390,7 @@ model:



- filters: &enc_final 1024
- filters: ${model.model_defaults.enc_final}
repeat: 1
kernel: [41]
stride: [1]
@@ -404,7 +405,7 @@

decoder:
_target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
feat_in: *enc_final
feat_in: ${model.model_defaults.enc_final}
feat_hidden: 128
feat_out: ${model.model_defaults.decoder_out_channels}
stride_layers: 1
Original file line number Diff line number Diff line change
@@ -13,11 +13,9 @@ model:
max_duration: 35.0
min_duration: 3.0
shuffle: true
use_start_end_token: false
# tarred datasets
is_tarred: false
tarred_audio_filepaths: null
shuffle_n: 2048
use_start_end_token: false
# bucketing params
bucketing_strategy: "synced_randomized"
bucketing_batch_size: null
188 changes: 188 additions & 0 deletions examples/asr/conf/ssl/conformer/conformer_ssl.yaml
@@ -0,0 +1,188 @@
# This config contains the default values for self-supervised pre-training of a Conformer ASR model, large size (~120M).

# Architecture and training config:
# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
# Here are the recommended configs for different variants of Conformer-CTC; other parameters are the same as in this config file.
# One extra layer (compared to the original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one.
#
# +--------------+---------+---------+----------+------------+-----+
# | Model        | d_model | n_heads | n_layers | time_masks | lr  |
# +==============+=========+=========+==========+============+=====+
# | Small (13M)  | 176     | 4       | 16       | 5          | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Medium (30M) | 256     | 4       | 18       | 5          | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Large (121M) | 512     | 8       | 18       | 10         | 2.0 |
# +--------------+---------+---------+----------+------------+-----+
#
# If you do not want to train with AMP, you may use a weight decay of 0.0 or reduce the number of time masks to 2
# with time_width=100. This may help when you want to train for fewer epochs and need faster convergence.
# With weight_decay=0.0, the learning rate may need to be reduced to 2.0.
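#
# Illustrative launch command for the Medium variant (the script path and config location follow the
# examples in this PR, and the overrides follow the table above; adjust them to your setup):
#   python examples/asr/speech_pretraining/speech_pre_training.py \
#     --config-path="../conf/ssl/conformer/" --config-name="conformer_ssl" \
#     model.encoder.d_model=256 model.encoder.n_heads=4 model.spec_augment.time_masks=5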

name: "Conformer-SSL"

model:
sample_rate: 16000
log_prediction: true # enables logging sample predictions in the output during training
ctc_reduction: 'mean_batch'

train_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
batch_size: 16 # you may increase batch_size if your memory allows
shuffle: true
num_workers: 8
pin_memory: false
use_start_end_token: true
trim_silence: false
max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
min_duration: 3.0
# tarred datasets
is_tarred: false
tarred_audio_filepaths: null
shuffle_n: 2048
# bucketing params
bucketing_strategy: "synced_randomized"
bucketing_batch_size: null

validation_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
batch_size: 16 # you may increase batch_size if your memory allows
shuffle: false
num_workers: 8
pin_memory: true
use_start_end_token: false



preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
sample_rate: ${model.sample_rate}
normalize: "per_feature"
window_size: 0.025
window_stride: 0.01
window: "hann"
features: &n_mels 80
n_fft: 512
log: true
frame_splicing: 1
dither: 0.00001
pad_to: 16
pad_value: 0.0

spec_augment:
_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 4
time_masks: 10
freq_width: 27
time_width: 0.05

encoder:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: ${model.preprocessor.features}
feat_out: -1 # you may set it if you need an output size different from the default d_model
n_layers: 18
d_model: 512

# Sub-sampling params
subsampling: striding # vggnet or striding; vggnet may give better results but needs more memory
subsampling_factor: 4 # must be a power of 2
subsampling_conv_channels: -1 # -1 sets it to d_model

# Feed forward module's params
ff_expansion_factor: 4

# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000

# Convolution module's params
conv_kernel_size: 31
conv_norm_type: 'batch_norm' # batch_norm or layer_norm

### regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules

dec_out: &dec_out 128

decoder:
_target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
feat_in: ${model.encoder.d_model}
feat_hidden: 128
feat_out: *dec_out
stride_layers: 0
non_stride_layers: 2


loss:
_target_: nemo.collections.asr.losses.ContrastiveLoss
in_dim: *n_mels
proj_dim: *dec_out
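# combine_time_steps groups 4 input frames per contrastive target, matching the encoder's
# 4x subsampling so targets and encoder outputs stay frame-aligned; codebook_size sets the
# number of quantized target entries.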
combine_time_steps: 4
codebook_size: 1200

optim:
name: adamw
lr: 5.0
# optimizer arguments
betas: [0.9, 0.98]
weight_decay: 1e-3

# scheduler setup
sched:
name: NoamAnnealing
d_model: ${model.encoder.d_model}
# scheduler config override
warmup_steps: 10000
warmup_ratio: null
min_lr: 1e-6

trainer:
gpus: -1 # number of GPUs, -1 would use all available GPUs
num_nodes: 1
max_epochs: 1000
max_steps: null # computed at runtime if not set
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
accelerator: ddp
accumulate_grad_batches: 1
gradient_clip_val: 0.0
amp_level: O0 # O1/O2 for mixed precision
precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
log_every_n_steps: 10 # Interval of logging.
progress_bar_refresh_rate: 10
resume_from_checkpoint: null # The path to a checkpoint file to continue training; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables the check
check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
sync_batchnorm: true
checkpoint_callback: false # Provided by exp_manager
logger: false # Provided by exp_manager

exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
# in case of multiple validation sets, first one is used
monitor: "val_loss"
mode: "min"
save_top_k: 5

# you need to set these two to True to continue the training
resume_if_exists: false
resume_ignore_no_checkpoint: false

# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: null
project: null
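
For reference, these SSL configs are consumed by the speech pre-training example referenced in the Jenkinsfile change above. The following is a hedged Python sketch of that flow, assuming the class and helper names used by examples/asr/speech_pretraining/speech_pre_training.py (SpeechEncDecSelfSupervisedModel, hydra_runner, exp_manager); it is an approximation of the script, not a copy of it.

import pytorch_lightning as pl

from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel
from nemo.core.config import hydra_runner
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="../conf/ssl/conformer/", config_name="conformer_ssl")
def main(cfg):
    # Build the Lightning trainer and experiment manager from the trainer / exp_manager
    # sections of the config, then the SSL model from the model section.
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    ssl_model = SpeechEncDecSelfSupervisedModel(cfg=cfg.model, trainer=trainer)

    # Pre-trains the encoder with the contrastive loss defined in the config; the resulting
    # checkpoint can later initialize the encoder of a supervised ASR model.
    trainer.fit(ssl_model)


if __name__ == "__main__":
    main()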