Bug fixings #229

Merged · 4 commits · Oct 28, 2021
38 changes: 19 additions & 19 deletions README.md
@@ -38,8 +38,8 @@ TensorFlowASR implements some automatic speech recognition architectures such as
- [Baselines](#baselines)
- [Publications](#publications)
- [Installation](#installation)
- [Installing from source (recommended)](#installing-from-source-recommended)
- [Installing via PyPi](#installing-via-pypi)
- [Installing from source](#installing-from-source)
- [Running in a container](#running-in-a-container)
- [Setup training and testing](#setup-training-and-testing)
- [TFLite Convertion](#tflite-convertion)
@@ -59,42 +59,33 @@ TensorFlowASR implements some automatic speech recognition architectures such as

### Baselines

- **CTCModel** (End2end models using CTC Loss for training, currently supported DeepSpeech2, Jasper)
- **Transducer Models** (End2end models using RNNT Loss for training, currently supported Conformer, ContextNet, Streaming Transducer)
- **CTCModel** (End2end models using CTC Loss for training, currently supported DeepSpeech2, Jasper)

### Publications

- **Deep Speech 2** (Reference: [https://arxiv.org/abs/1512.02595](https://arxiv.org/abs/1512.02595))
See [examples/deepspeech2](./examples/deepspeech2)
- **Jasper** (Reference: [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288))
See [examples/jasper](./examples/jasper)
- **Conformer Transducer** (Reference: [https://arxiv.org/abs/2005.08100](https://arxiv.org/abs/2005.08100))
See [examples/conformer](./examples/conformer)
- **Streaming Transducer** (Reference: [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621))
See [examples/streaming_transducer](./examples/streaming_transducer)
- **ContextNet** (Reference: [http://arxiv.org/abs/2005.03191](http://arxiv.org/abs/2005.03191))
See [examples/contextnet](./examples/contextnet)
- **Deep Speech 2** (Reference: [https://arxiv.org/abs/1512.02595](https://arxiv.org/abs/1512.02595))
See [examples/deepspeech2](./examples/deepspeech2)
- **Jasper** (Reference: [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288))
See [examples/jasper](./examples/jasper)

## Installation

For training and testing, you should use `git clone` for installing necessary packages from other authors (`ctc_decoders`, `rnnt_loss`, etc.)

### Installing via PyPi

For tensorflow 2.3.x, run `pip3 install -U 'TensorFlowASR[tf2.3]'` or `pip3 install -U 'TensorFlowASR[tf2.3-gpu]'`

For tensorflow 2.4.x, run `pip3 install -U 'TensorFlowASR[tf2.4]'` or `pip3 install -U 'TensorFlowASR[tf2.4-gpu]'`

For tensorflow 2.5.x, run `pip3 install -U 'TensorFlowASR[tf2.5]'` or `pip3 install -U 'TensorFlowASR[tf2.5-gpu]'`

For tensorflow 2.6.x, run `pip3 install -U 'TensorFlowASR[tf2.6]'` or `pip3 install -U 'TensorFlowASR[tf2.6-gpu]'`

### Installing from source
### Installing from source (recommended)

```bash
git clone https://github.com/TensorSpeech/TensorFlowASR.git
cd TensorFlowASR
pip3 install -e '.[tf2.6]' # see other options in setup.py file
# Tensorflow 2.x (with 2.x >= 2.3)
pip3 install -e ".[tf2.x]" # or ".[tf2.x-gpu]"
```

For anaconda3:
@@ -105,9 +96,18 @@ conda activate tfasr
pip install -U tensorflow-gpu # upgrade to latest version of tensorflow
git clone https://github.com/TensorSpeech/TensorFlowASR.git
cd TensorFlowASR
pip3 install '.[tf2.3]' # or '.[tf2.3-gpu]' or '.[tf2.4]' or '.[tf2.4-gpu]' or '.[tf2.5]' or '.[tf2.5-gpu]'
# Tensorflow 2.x (with 2.x >= 2.3)
pip3 install -e ".[tf2.x]" # or ".[tf2.x-gpu]"
```

### Installing via PyPi

```bash
# Tensorflow 2.x (with 2.x >= 2.3)
pip3 install -U "TensorFlowASR[tf2.x]" # or pip3 install -U "TensorFlowASR[tf2.x-gpu]"
```
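
The installation commands above use `tf2.x` as a placeholder for whichever TensorFlow minor version is installed. As a rough sketch (assuming the extras are still declared per minor version in `setup.py`, as with the tf2.3 through tf2.6 options listed previously; the exact pins are illustrative, not the repository's verbatim file), the extras might look like this:

```python
# Hypothetical sketch of the per-version extras behind "tf2.x" / "tf2.x-gpu";
# version pins are illustrative, not copied from the repository's setup.py.
from setuptools import setup, find_packages

setup(
    name="TensorFlowASR",
    packages=find_packages(),
    extras_require={
        "tf2.6": ["tensorflow>=2.6.0,<2.7"],
        "tf2.6-gpu": ["tensorflow-gpu>=2.6.0,<2.7"],
        # ...one pair per supported minor version (tf2.3, tf2.4, tf2.5, ...)
    },
)
```

So `pip3 install -U "TensorFlowASR[tf2.6]"` would pull in the matching CPU build of TensorFlow 2.6.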


### Running in a container

```bash
16 changes: 8 additions & 8 deletions examples/conformer/config.yml
@@ -31,7 +31,7 @@ decoder_config:
beam_width: 0
norm_score: True
corpus_files:
- /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
- /mnt/Data/MLDL/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv

model_config:
name: conformer
@@ -75,8 +75,8 @@ learning_config:
num_masks: 1
mask_factor: 27
data_paths:
- /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
tfrecords_dir: /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/tfrecords_1030
- /mnt/Data/MLDL/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
tfrecords_dir: /mnt/Data/MLDL/Datasets/ASR/Raw/LibriSpeech/tfrecords_1030
shuffle: True
cache: True
buffer_size: 100
@@ -86,7 +86,7 @@ learning_config:
eval_dataset_config:
use_tf: True
data_paths: null
tfrecords_dir: /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/tfrecords_1030
tfrecords_dir: /mnt/Data/MLDL/Datasets/ASR/Raw/LibriSpeech/tfrecords_1030
shuffle: False
cache: True
buffer_size: 100
@@ -113,13 +113,13 @@ learning_config:
batch_size: 2
num_epochs: 50
checkpoint:
filepath: /mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5
save_best_only: True
filepath: /mnt/Miscellanea/Models/local/conformer/checkpoints/{epoch:02d}.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
states_dir: /mnt/e/Models/local/conformer/states
states_dir: /mnt/Miscellanea/Models/local/conformer/states
tensorboard:
log_dir: /mnt/e/Models/local/conformer/tensorboard
log_dir: /mnt/Miscellanea/Models/local/conformer/tensorboard
histogram_freq: 1
write_graph: True
write_images: True
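
The `checkpoint` block changed above mirrors the keyword arguments of a Keras `ModelCheckpoint` callback; with `save_best_only: False`, a weights file is written for every epoch instead of only the best-scoring one. A minimal sketch of how such a block could be wired into training (assuming the keys map one-to-one onto the callback, which may differ from the project's own config loader):

```python
import tensorflow as tf

# Values mirror the checkpoint section of the config diff above.
checkpoint_config = {
    "filepath": "/mnt/Miscellanea/Models/local/conformer/checkpoints/{epoch:02d}.h5",
    "save_best_only": False,    # keep every epoch's weights, not just the best
    "save_weights_only": True,  # store weights only, not the full model
    "save_freq": "epoch",
}

# Minimal sketch: pass the keys straight through to the Keras callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_config)
# model.fit(train_dataset, epochs=50, callbacks=[checkpoint_callback])
```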
70 changes: 18 additions & 52 deletions examples/conformer/saved_model.py
@@ -26,40 +26,15 @@

parser = argparse.ArgumentParser(prog="Conformer Testing")

parser.add_argument(
"--config",
type=str,
default=DEFAULT_YAML,
help="The file path of model configuration file",
)

parser.add_argument(
"--h5",
type=str,
default=None,
help="Path to saved h5 weights",
)

parser.add_argument(
"--sentence_piece",
default=False,
action="store_true",
help="Whether to use `SentencePiece` model",
)

parser.add_argument(
"--subwords",
default=False,
action="store_true",
help="Use subwords",
)

parser.add_argument(
"--output_dir",
type=str,
default=None,
help="Output directory for saved model",
)
parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")

parser.add_argument("--h5", type=str, default=None, help="Path to saved h5 weights")

parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")

parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")

parser.add_argument("--output_dir", type=str, default=None, help="Output directory for saved model")

args = parser.parse_args()

@@ -94,23 +69,14 @@
conformer.add_featurizers(speech_featurizer, text_featurizer)


class aModule(tf.Module):
def __init__(self, model):
super().__init__()
self.model = model

@tf.function(
input_signature=[
{
"inputs": tf.TensorSpec(shape=[None, None, 80, 1], dtype=tf.float32, name="inputs"),
"inputs_length": tf.TensorSpec(shape=[None], dtype=tf.int32, name="inputs_length"),
}
]
)
def pred(self, input_batch):
result = self.model.recognize(input_batch)
return {"ASR": result}
# TODO: Support saved model conversion
# class ConformerModule(tf.Module):
# def __init__(self, model: Conformer, name=None):
# super().__init__(name=name)
# self.model = model
# self.pred = model.make_tflite_function()


module = aModule(conformer)
tf.saved_model.save(module, args.output_dir, signatures={"serving_default": module.pred})
# model = ConformerModule(model=conformer)
# tf.saved_model.save(model, args.output_dir)
conformer.save(args.output_dir, include_optimizer=False, save_format="tf")
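
With the custom `tf.Module` wrapper commented out, the script now exports the model directly via `Model.save(..., save_format="tf")`. A rough usage sketch for loading the resulting SavedModel back (the path and `compile=False` are assumptions for illustration, not taken from the repository's scripts):

```python
import tensorflow as tf

# Hypothetical path where saved_model.py wrote the exported model.
output_dir = "/tmp/conformer_saved_model"

# Reload as a Keras model; compile=False skips restoring optimizer state,
# which was excluded at export time (include_optimizer=False).
model = tf.keras.models.load_model(output_dir, compile=False)
model.summary()
```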
2 changes: 1 addition & 1 deletion examples/contextnet/config.yml
@@ -247,7 +247,7 @@ learning_config:
num_epochs: 20
checkpoint:
filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5
save_best_only: True
save_best_only: False
save_weights_only: True
save_freq: epoch
states_dir: /mnt/e/Models/local/contextnet/states
2 changes: 1 addition & 1 deletion examples/deepspeech2/config.yml
@@ -91,7 +91,7 @@ learning_config:
num_epochs: 20
checkpoint:
filepath: /mnt/e/Models/local/deepspeech2/checkpoints/{epoch:02d}.h5
save_best_only: True
save_best_only: False
save_weights_only: True
save_freq: epoch
states_dir: /mnt/e/Models/local/deepspeech2/states
2 changes: 1 addition & 1 deletion examples/jasper/config.yml
@@ -98,7 +98,7 @@ learning_config:
num_epochs: 20
checkpoint:
filepath: /mnt/e/Models/local/jasper/checkpoints/{epoch:02d}.h5
save_best_only: True
save_best_only: False
save_weights_only: True
save_freq: epoch
states_dir: /mnt/e/Models/local/jasper/states
2 changes: 1 addition & 1 deletion examples/rnn_transducer/config.yml
@@ -106,7 +106,7 @@ learning_config:
num_epochs: 20
checkpoint:
filepath: /mnt/e/Models/local/rnn_transducer/checkpoints/{epoch:02d}.h5
save_best_only: True
save_best_only: False
save_weights_only: True
save_freq: epoch
states_dir: /mnt/e/Models/local/rnn_transducer/states
58 changes: 29 additions & 29 deletions notebooks/conformer.ipynb
@@ -1,30 +1,4 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f",
"display_name": "Python 3.8.8 64-bit ('tfo': conda)"
},
"metadata": {
"interpreter": {
"hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
@@ -137,7 +111,7 @@
" \"num_epochs\": 50,\n",
" \"checkpoint\": {\n",
" \"filepath\": \"/mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5\",\n",
" \"save_best_only\": True,\n",
" \"save_best_only\": False,\n",
" \"save_weights_only\": True,\n",
" \"save_freq\": \"epoch\",\n",
" },\n",
@@ -265,5 +239,31 @@
"outputs": [],
"source": []
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('tfo': conda)",
"name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8-final"
},
"metadata": {
"interpreter": {
"hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
}
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}
58 changes: 29 additions & 29 deletions notebooks/contextnet.ipynb
@@ -1,30 +1,4 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f",
"display_name": "Python 3.8.8 64-bit ('tfo': conda)"
},
"metadata": {
"interpreter": {
"hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
@@ -308,7 +282,7 @@
" \"num_epochs\": 20,\n",
" \"checkpoint\": {\n",
" \"filepath\": \"/mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5\",\n",
" \"save_best_only\": True,\n",
" \"save_best_only\": False,\n",
" \"save_weights_only\": True,\n",
" \"save_freq\": \"epoch\",\n",
" },\n",
@@ -429,5 +403,31 @@
")"
]
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('tfo': conda)",
"name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8-final"
},
"metadata": {
"interpreter": {
"hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
}
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}