% appendices.tex

\chapter{Model Architecture}
\label{chapter:model-architecture}

\section*{Encoder Architecture}

\begin{verbatim}
CRDNN(
  (CNN): Sequential(
    (block_0): CNN_Block(
      (conv_1): Conv2d(
        (conv): Conv2d(1, 128, kernel_size=(3, 3), 
                        stride=(1, 1))
      )
      (norm_1): LayerNorm(
        (norm): LayerNorm((40, 128), eps=1e-05, 
                            elementwise_affine=True)
      )
      (act_1): LeakyReLU(negative_slope=0.01)
      (conv_2): Conv2d(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), 
                        stride=(1, 1))
      )
      (norm_2): LayerNorm(
        (norm): LayerNorm((40, 128), eps=1e-05, 
                        elementwise_affine=True)
      )
      (act_2): LeakyReLU(negative_slope=0.01)
      (pooling): Pooling1d(
        (pool_layer): MaxPool2d(kernel_size=(1, 2), 
                            stride=(1, 2), padding=(0, 0), 
                            dilation=(1, 1), ceil_mode=False)
      )
      (drop): Dropout2d(
        (drop): Dropout2d(p=0.15, inplace=False)
      )
    )
    (block_1): CNN_Block(
      (conv_1): Conv2d(
        (conv): Conv2d(128, 256, kernel_size=(3, 3), 
                        stride=(1, 1))
      )
      (norm_1): LayerNorm(
        (norm): LayerNorm((20, 256), 
                            eps=1e-05, elementwise_affine=True)
      )
      (act_1): LeakyReLU(negative_slope=0.01)
      (conv_2): Conv2d(
        (conv): Conv2d(256, 256, kernel_size=(3, 3), 
                        stride=(1, 1))
      )
      (norm_2): LayerNorm(
        (norm): LayerNorm((20, 256), eps=1e-05, 
                        elementwise_affine=True)
      )
      (act_2): LeakyReLU(negative_slope=0.01)
      (pooling): Pooling1d(
        (pool_layer): MaxPool2d(kernel_size=(1, 2), 
                            stride=(1, 2), padding=(0, 0), 
                            dilation=(1, 1), ceil_mode=False)
      )
      (drop): Dropout2d(
        (drop): Dropout2d(p=0.15, inplace=False)
      )
    )
  )
  (time_pooling): Pooling1d(
    (pool_layer): MaxPool2d(kernel_size=(1, 4), 
                    stride=(1, 4), padding=(0, 0), 
                    dilation=(1, 1), ceil_mode=False)
  )
  (RNN): LSTM(
    (rnn): LSTM(2560, 512, num_layers=4, 
            batch_first=True, dropout=0.15, bidirectional=True)
  )
  (DNN): Sequential(
    (block_0): DNN_Block(
      (linear): Linear(
        (w): Linear(in_features=1024, 
                    out_features=256, bias=True)
      )
      (norm): BatchNorm1d(
        (norm): BatchNorm1d(256, eps=1e-05, 
                    momentum=0.1, affine=True, 
                    track_running_stats=True)
      )
      (act): LeakyReLU(negative_slope=0.01)
      (dropout): Dropout(p=0.15, inplace=False)
    )
    (block_1): DNN_Block(
      (linear): Linear(
        (w): Linear(in_features=256, out_features=256, 
                    bias=True)
      )
      (norm): BatchNorm1d(
        (norm): BatchNorm1d(256, eps=1e-05, 
                        momentum=0.1, affine=True, 
                        track_running_stats=True)
      )
      (act): LeakyReLU(negative_slope=0.01)
      (dropout): Dropout(p=0.15, inplace=False)
    )
  )
)
\end{verbatim}


\section*{Decoder Architecture}

\begin{verbatim}
AttentionalRNNDecoder(
  (proj): Linear(in_features=1024, out_features=512, bias=True)
  (attn): LocationAwareAttention(
    (mlp_enc): Linear(in_features=256, 
                    out_features=512, bias=True)
    (mlp_dec): Linear(in_features=512,
                    out_features=512, bias=True)
    (mlp_attn): Linear(in_features=512, 
                    out_features=1, bias=False)
    (conv_loc): Conv1d(1, 10, kernel_size=(201,), 
                    stride=(1,), padding=(100,), 
                    bias=False)
    (mlp_loc): Linear(in_features=10, 
                    out_features=512, bias=True)
    (mlp_out): Linear(in_features=256, 
                    out_features=512, bias=True)
    (softmax): Softmax(dim=-1)
  )
  (drop): Dropout(p=0.15, inplace=False)
  (rnn): GRUCell(
    (rnn_cells): ModuleList(
      (0): GRUCell(640, 512)
    )
    (dropout_layers): ModuleList()
  )
)
\end{verbatim}


\chapter{Configuration Files}
\label{chapter:hyparam}

\section*{dataset.yaml}
\begin{verbatim}
seed: 98585
__set_seed: !apply:torch.manual_seed [!ref <seed>]
dataset_seed: 26000

output_folder: !ref runs/<seed>
save_folder: !ref <output_folder>/save
local_dataset_folder: !ref /m/triton/scratch/
    biz/bizspeech/ASR_Datasets/<dataset_seed>/datasets 

# Path where data manifest files will be stored
train_annotation: !ref <local_dataset_folder>/train.json
valid_annotation: !ref <local_dataset_folder>/val.json
test_annotation: !ref <local_dataset_folder>/test.json

# Webdataset Parameters
use_wds: True
number_of_shards: 3000
#shard_maxcount: 10000
shardfiles_pattern: !ref <local_dataset_folder>/
    bizspeech_shard-%06d.tar.gz
use_compression: True
train_shards: (0, 2994)
val_shards: (2994, 2997)
test_shards: (2997, 3000)

# Audio resampling and conversion to a single channel
sample_rate: 16000
preprocess_audio: !new:speechbrain.dataio.
            preprocess.AudioNormalizer
    sample_rate: !ref <sample_rate>
    mix: avg-to-mono

# Set up folders for reading from and writing to
data_folder: /scratch/biz/bizspeech/MEDIA 
hours_reqd: 26000
nonnative: 0.3
qna: 0.6
sorting: random # ascending
non_CEO_utt: True
strict_included: False
utterance_duration_limit: 1000 # in seconds
audio_filetype: wav
trainValTest: (1,0)
\end{verbatim}

\section*{tokenizer.yaml}
\begin{verbatim}
dataset: !include:dataset.yaml

# Tokenizer parameters
token_type: bpe  # ["unigram", "bpe", "char"]
token_output: 5000  # index(blank/eos/bos/unk) = 0
character_coverage: 1.0
annotation_read: txt # field to read

# Tokenizer train object
tokenizer_train: !name:speechbrain.
    tokenizers.SentencePiece.SentencePiece
   model_dir: !ref <dataset[save_folder]>
   vocab_size: !ref <token_output>
   annotation_train: !ref <dataset[train_annotation]>
   annotation_read: !ref <annotation_read>
   model_type: !ref <token_type> # ["unigram", "bpe", "char"]
   character_coverage: !ref <character_coverage>
   annotation_list_to_check:
     - !ref <dataset[train_annotation]>
     - !ref <dataset[valid_annotation]>
   annotation_format: json
\end{verbatim}

\section*{train.yaml}
\begin{verbatim}
dataset: !include:dataset.yaml
tokenizer_params: !include:tokenizer.yaml

train_WER_required: False
wer_file: !ref <dataset[output_folder]>/wer.txt
wer_file_p: !ref <dataset[output_folder]>/wer_p.txt
train_log: !ref <dataset[output_folder]>/train_log.txt
tensorboard_dir: !ref runs/tensorboard_log/<dataset[seed]>

tokenizer: !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.
    utils.parameter_transfer.Pretrainer
   collect_in: !ref <dataset[save_folder]>
   loadables:
      tokenizer: !ref <tokenizer>
      #model: !ref <model>
   paths:
      tokenizer: !ref <dataset[save_folder]>/
        <tokenizer_params[token_output]>_
        <tokenizer_params[token_type]>.model

gradient_accumulation: False
subbatches_count_for_grad_acc: 4
require_native_wer: False
wer_native_file: !ref <dataset[output_folder]>
    /wer_native.txt
wer_nonnative_file: !ref <dataset[output_folder]>
    /wer_nonnative.txt
libri:
    test_on_librispeech: False
    data_folder: librispeech
    test_csv:
        - !ref <libri[data_folder]>/test-clean.csv
        - !ref <libri[data_folder]>/test-other.csv
    tokenizer: !ref <tokenizer>
    bos_index: 0
    eos_index: 0
    test_dataloader_opts:
        batch_size: 8
        num_workers: 1
    
train_logger: !new:speechbrain.
    utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

log_to_tensorboard: True
tensorboard_logger: !new:speechbrain.
    utils.train_logger.TensorboardLogger
    save_dir: !ref <tensorboard_dir>

# Training parameters
number_of_epochs: 50
number_of_ctc_epochs: 15
batch_size: 8
lr: 0.0001
ctc_weight: 0.5
ckpt_interval_minutes: 15 
label_smoothing: 0.1

# Dataloader options
train_dataloader_opts:    
    batch_size: null
    num_workers: 8
valid_dataloader_opts:
    batch_size: null
    num_workers: 1
test_dataloader_opts:
    batch_size: null
    num_workers: 1

use_dynamic_batch_size: True
looped_nominal_epoch: 5000
dynamic_batch_kwargs:  
    len_key: "sig"
    min_sample_len: 15999
    # 1s * 16000 (Sample rate) - 1
    max_sample_len: 960000
    # 60s * 16000 (Sample rate)
    sampler_kwargs:
        target_batch_numel: 2880000  
        # 180s * 16000 (Sample rate)
        max_batch_numel: 3840000 
        # 240s * 16000 (Sample rate)


# Feature parameters
n_fft: 400
n_mels: 40

# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 512
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 256
emb_size: 128
dec_neurons: 512
output_neurons: 5000  
blank_index: 0
bos_index: 0
eos_index: 0
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 80
#test_beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 240
lm_weight: 0.50
ctc_weight_decode: 0.0
coverage_penalty: 1.5
temperature: 1.25

epoch_counter: !new:speechbrain.
    utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Feature extraction
compute_features: !new:speechbrain.
    lobes.features.Fbank
    sample_rate: !ref <dataset[sample_rate]>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

# Feature normalization (mean and std)
normalize: !new:speechbrain.
    processing.features.InputNormalization
    
encoder: !new:speechbrain.
        lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: 
        !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
    use_rnnp: False

embedding: !new:speechbrain.
    nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

# Attention-based RNN decoder.
decoder: !new:speechbrain.
    nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dnn_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: 1
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

# Linear transformation on top of the encoder.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

# Linear transformation on top of the decoder.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

# Final softmax (for log posteriors computation).
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# Cost definition for the CTC part.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>


modules:
    encoder: !ref <encoder>
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    normalize: !ref <normalize>

model: !new:torch.nn.ModuleList
    - - !ref <encoder>
      - !ref <embedding>
      - !ref <decoder>
      - !ref <ctc_lin>
      - !ref <seq_lin>

valid_search: !new:speechbrain.
        decoders.S2SRNNGreedySearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.
        decoders.S2SRNNGreedySearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>

error_rate_computer: !name:speechbrain.
    utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.
    utils.metric_stats.ErrorRateStats
    split_tokens: True

checkpointer: !new:speechbrain.
    utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <dataset[save_folder]>
    recoverables:
        model: !ref <model>
        scheduler: !ref <lr_annealing>
        normalizer: !ref <normalize>
\end{verbatim}