TensorSpeech · nglehuy · Dec 27, 2020 · Dec 27, 2020 · Dec 27, 2020
diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml
@@ -56,6 +56,7 @@ model_config:
   prediction_layer_norm: True
   prediction_projection_units: 0
   joint_dim: 320
+  joint_activation: tanh
 
 learning_config:
   augmentations:

diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml
@@ -192,6 +192,7 @@ model_config:
   prediction_layer_norm: True
   prediction_projection_units: 0
   joint_dim: 640
+  joint_activation: tanh
 
 learning_config:
   augmentations:

diff --git a/examples/deepspeech2/README.md b/examples/deepspeech2/README.md
@@ -29,4 +29,5 @@ model_config:
 
 See `python examples/deepspeech2/train_*.py --help`
 
-See `python examples/deepspeech2/test_*.py --help`
+See `python examples/deepspeech2/test_*.py --help`
+
diff --git a/examples/jasper/README.md b/examples/jasper/README.md
@@ -37,4 +37,5 @@ model_config:
 
 See `python examples/jasper/train_*.py --help`
 
-See `python examples/jasper/test_*.py --help`
+See `python examples/jasper/test_*.py --help`
+
diff --git a/examples/streaming_transducer/config.yml b/examples/streaming_transducer/config.yml
@@ -49,6 +49,7 @@ model_config:
   prediction_projection_units: 320
   prediction_layer_norm: True
   joint_dim: 320
+  joint_activation: tanh
 
 learning_config:
   augmentations:

diff --git a/setup.py b/setup.py
@@ -33,7 +33,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.6.0",
+    version="0.6.1",
     author="Huy Le Nguyen",
     author_email="[email protected]",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",

diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/conformer.py
@@ -384,6 +384,7 @@ def __init__(self,
                  prediction_layer_norm: bool = True,
                  prediction_projection_units: int = 0,
                  joint_dim: int = 1024,
+                 joint_activation: str = "tanh",
                  kernel_regularizer=L2,
                  bias_regularizer=L2,
                  name: str = "conformer_transducer",
@@ -414,6 +415,7 @@ def __init__(self,
             layer_norm=prediction_layer_norm,
             projection_units=prediction_projection_units,
             joint_dim=joint_dim,
+            joint_activation=joint_activation,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name=name, **kwargs

diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/contextnet.py
@@ -196,7 +196,7 @@ class ContextNet(Transducer):
     def __init__(self,
                  vocabulary_size: int,
                  encoder_blocks: List[dict],
-                 encoder_alpha: float,
+                 encoder_alpha: float = 0.5,
                  prediction_embed_dim: int = 512,
                  prediction_embed_dropout: int = 0,
                  prediction_num_rnns: int = 1,
@@ -206,6 +206,7 @@ def __init__(self,
                  prediction_layer_norm: bool = True,
                  prediction_projection_units: int = 0,
                  joint_dim: int = 1024,
+                 joint_activation: str = "tanh",
                  kernel_regularizer=L2,
                  bias_regularizer=L2,
                  name: str = "contextnet",
@@ -228,6 +229,7 @@ def __init__(self,
             layer_norm=prediction_layer_norm,
             projection_units=prediction_projection_units,
             joint_dim=joint_dim,
+            joint_activation=joint_activation,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name=name, **kwargs

diff --git a/tensorflow_asr/models/ctc.py b/tensorflow_asr/models/ctc.py
@@ -25,6 +25,7 @@
 class CtcModel(Model):
     def __init__(self, **kwargs):
         super(CtcModel, self).__init__(**kwargs)
+        self.time_reduction_factor = 1
 
     def _build(self, input_shape):
         features = tf.keras.Input(input_shape, dtype=tf.float32)
@@ -67,7 +68,7 @@ def recognize_tflite(self, signal):
         features = self.speech_featurizer.tf_extract(signal)
         features = tf.expand_dims(features, axis=0)
         input_length = shape_list(features)[1]
-        input_length = get_reduced_length(input_length, self.base_model.time_reduction_factor)
+        input_length = get_reduced_length(input_length, self.time_reduction_factor)
         input_length = tf.expand_dims(input_length, axis=0)
         logits = self(features, training=False)
         probs = tf.nn.softmax(logits)
@@ -113,7 +114,7 @@ def recognize_beam_tflite(self, signal):
         features = self.speech_featurizer.tf_extract(signal)
         features = tf.expand_dims(features, axis=0)
         input_length = shape_list(features)[1]
-        input_length = get_reduced_length(input_length, self.base_model.time_reduction_factor)
+        input_length = get_reduced_length(input_length, self.time_reduction_factor)
         input_length = tf.expand_dims(input_length, axis=0)
         logits = self(features, training=False)
         probs = tf.nn.softmax(logits)

diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/streaming_transducer.py
@@ -192,6 +192,7 @@ def __init__(self,
                  prediction_layer_norm: bool = True,
                  prediction_projection_units: int = 640,
                  joint_dim: int = 640,
+                 joint_activation: str = "tanh",
                  kernel_regularizer = None,
                  bias_regularizer = None,
                  name = "StreamingTransducer",
@@ -217,6 +218,7 @@ def __init__(self,
             layer_norm=prediction_layer_norm,
             projection_units=prediction_projection_units,
             joint_dim=joint_dim,
+            joint_activation=joint_activation,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name=name, **kwargs

diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer.py
@@ -146,11 +146,19 @@ class TransducerJoint(tf.keras.Model):
     def __init__(self,
                  vocabulary_size: int,
                  joint_dim: int = 1024,
+                 activation: str = "tanh",
                  kernel_regularizer=None,
                  bias_regularizer=None,
                  name="tranducer_joint",
                  **kwargs):
         super(TransducerJoint, self).__init__(name=name, **kwargs)
+
+        activation = activation.lower()
+        if activation == "linear": self.activation = tf.keras.activation.linear
+        elif activation == "relu": self.activation = tf.nn.relu
+        elif activation == "tanh": self.activation = tf.nn.tanh
+        else: raise ValueError("activation must be either 'linear', 'relu' or 'tanh'")
+
         self.ffn_enc = tf.keras.layers.Dense(
             joint_dim, name=f"{name}_enc",
             kernel_regularizer=kernel_regularizer,
@@ -174,7 +182,7 @@ def call(self, inputs, training=False, **kwargs):
         pred_out = self.ffn_pred(pred_out, training=training)  # [B, U, P] => [B, U, V]
         enc_out = tf.expand_dims(enc_out, axis=2)
         pred_out = tf.expand_dims(pred_out, axis=1)
-        outputs = tf.nn.tanh(enc_out + pred_out)  # => [B, T, U, V]
+        outputs = self.activation(enc_out + pred_out)  # => [B, T, U, V]
         outputs = self.ffn_out(outputs, training=training)
         return outputs
 
@@ -200,6 +208,7 @@ def __init__(self,
                  layer_norm: bool = True,
                  projection_units: int = 0,
                  joint_dim: int = 1024,
+                 joint_activation: str = "tanh",
                  kernel_regularizer=None,
                  bias_regularizer=None,
                  name="transducer",
@@ -223,6 +232,7 @@ def __init__(self,
         self.joint_net = TransducerJoint(
             vocabulary_size=vocabulary_size,
             joint_dim=joint_dim,
+            activation=joint_activation,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name=f"{name}_joint"

diff --git a/tests/conformer/config.yml b/tests/conformer/config.yml
@@ -0,0 +1,95 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+speech_config:
+  sample_rate: 16000
+  frame_ms: 25
+  stride_ms: 10
+  num_feature_bins: 80
+  feature_type: log_mel_spectrogram
+  preemphasis: 0.97
+  normalize_signal: True
+  normalize_feature: True
+  normalize_per_feature: False
+
+decoder_config:
+  vocabulary: null
+  target_vocab_size: 1024
+  max_subword_length: 4
+  blank_at_zero: True
+  beam_width: 5
+  norm_score: True
+
+model_config:
+  name: conformer
+  encoder_subsampling:
+    type: conv2d
+    filters: 144
+    kernel_size: 3
+    strides: 2
+  encoder_positional_encoding: sinusoid_concat
+  encoder_dmodel: 144
+  encoder_num_blocks: 16
+  encoder_head_size: 36
+  encoder_num_heads: 4
+  encoder_mha_type: relmha
+  encoder_kernel_size: 32
+  encoder_fc_factor: 0.5
+  encoder_dropout: 0.1
+  prediction_embed_dim: 320
+  prediction_embed_dropout: 0
+  prediction_num_rnns: 1
+  prediction_rnn_units: 320
+  prediction_rnn_type: lstm
+  prediction_rnn_implementation: 1
+  prediction_layer_norm: True
+  prediction_projection_units: 0
+  joint_dim: 320
+  joint_activation: tanh
+
+learning_config:
+  augmentations:
+    after:
+      time_masking:
+        num_masks: 10
+        mask_factor: 100
+        p_upperbound: 0.05
+      freq_masking:
+        num_masks: 1
+        mask_factor: 27
+
+  dataset_config:
+    train_paths:
+      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
+    eval_paths:
+      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
+      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
+    test_paths:
+      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
+
+  optimizer_config:
+    warmup_steps: 40000
+    beta1: 0.9
+    beta2: 0.98
+    epsilon: 1e-9
+
+  running_config:
+    batch_size: 2
+    accumulation_steps: 4
+    num_epochs: 20
+    outdir: /mnt/Miscellanea/Models/local/conformer
+    log_interval_steps: 300
+    eval_interval_steps: 500
+    save_interval_steps: 1000
diff --git a/tests/conformer/test_conformer.py b/tests/conformer/test_conformer.py
@@ -0,0 +1,57 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+import tensorflow as tf
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+from tensorflow_asr.configs.config import Config
+from tensorflow_asr.models.conformer import Conformer
+from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+
+
+def test_conformer():
+    config = Config(DEFAULT_YAML, learning=False)
+
+    text_featurizer = CharFeaturizer(config.decoder_config)
+
+    speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+
+    model = Conformer(vocabulary_size=text_featurizer.num_classes, **config.model_config)
+
+    model._build(speech_featurizer.shape)
+    model.summary(line_length=150)
+
+    model.add_featurizers(speech_featurizer=speech_featurizer, text_featurizer=text_featurizer)
+
+    concrete_func = model.make_tflite_function(timestamp=False).get_concrete_function()
+    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.experimental_new_converter = True
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
+    converter.convert()
+
+    print("Converted successfully with no timestamp")
+
+    concrete_func = model.make_tflite_function(timestamp=True).get_concrete_function()
+    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.experimental_new_converter = True
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
+    converter.convert()
+
+    print("Converted successfully with timestamp")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,4 +29,5 @@ model_config:

		See `python examples/deepspeech2/train_*.py --help`

		See `python examples/deepspeech2/test_*.py --help`
		See `python examples/deepspeech2/test_*.py --help`
Original file line number	Diff line number	Diff line change
Expand Up		@@ -37,4 +37,5 @@ model_config:

		See `python examples/jasper/train_*.py --help`

		See `python examples/jasper/test_*.py --help`
		See `python examples/jasper/test_*.py --help`