Merge pull request #63 from TensorSpeech/dev/decoding
Supported New RNN Transducer Beam Search
nglehuy authored Dec 6, 2020
2 parents b7cb5d3 + d3cfb4d commit a9b0850
Showing 25 changed files with 495 additions and 463 deletions.
10 changes: 7 additions & 3 deletions examples/conformer/README.md
@@ -108,6 +108,10 @@ TFLite Conversion, see `python examples/conformer/tflite_conformer.py --help`

**Error Rates**

- | Test-clean | WER (%) | CER (%) |
- | :--------: | :-------: | :--------: |
- | _Greedy_ | 6.4476862 | 2.51828337 |
+ | **Test-clean** | WER (%) | CER (%) |
+ | :------------: | :-------: | :--------: |
+ | _Greedy_ | 6.4476862 | 2.51828337 |
+
+ | **Test-other** | WER (%) | CER (%) |
+ | :------------: | :--------: | :--------: |
+ | _Greedy_ | 15.7308521 | 7.67273521 |
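
The tables above report word and character error rates from greedy decoding. As a reminder of what those numbers mean, here is a minimal, self-contained Levenshtein-based sketch of WER/CER computation; it is illustrative only and is not the metric implementation used by TensorFlowASR.

```python
def edit_distance(ref, hyp):
    """Levenshtein distance between two sequences (words or characters)."""
    dp = list(range(len(hyp) + 1))          # distances against the empty reference
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,           # reference token missing from hyp
                        dp[j - 1] + 1,       # extra token inserted in hyp
                        prev + (r != h))     # substitution (0 cost on match)
            prev = cur
    return dp[-1]

def wer(ref_text, hyp_text):
    """Word error rate in percent."""
    ref = ref_text.split()
    return 100.0 * edit_distance(ref, hyp_text.split()) / max(len(ref), 1)

def cer(ref_text, hyp_text):
    """Character error rate in percent."""
    return 100.0 * edit_distance(list(ref_text), list(hyp_text)) / max(len(ref_text), 1)

# wer("hello world", "hello word") == 50.0  (one substituted word out of two)
```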
35 changes: 18 additions & 17 deletions examples/conformer/config.yml
@@ -33,27 +33,28 @@ decoder_config:

model_config:
name: conformer
- subsampling:
+ encoder_subsampling:
type: conv2d
filters: 144
kernel_size: 3
strides: 2
- positional_encoding: sinusoid_concat
- dmodel: 144
- num_blocks: 16
- head_size: 36
- num_heads: 4
- mha_type: relmha
- kernel_size: 32
- fc_factor: 0.5
- dropout: 0.1
- embed_dim: 320
- embed_dropout: 0.1
- num_rnns: 1
- rnn_units: 320
- rnn_type: lstm
- layer_norm: True
- projection_units: 0
+ encoder_positional_encoding: sinusoid_concat
+ encoder_dmodel: 144
+ encoder_num_blocks: 16
+ encoder_head_size: 36
+ encoder_num_heads: 4
+ encoder_mha_type: relmha
+ encoder_kernel_size: 32
+ encoder_fc_factor: 0.5
+ encoder_dropout: 0.1
+ prediction_embed_dim: 320
+ prediction_embed_dropout: 0.1
+ prediction_num_rnns: 1
+ prediction_rnn_units: 320
+ prediction_rnn_type: lstm
+ prediction_rnn_implementation: 1
+ prediction_layer_norm: True
+ prediction_projection_units: 0
joint_dim: 320

learning_config:
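
The model_config keys now carry explicit encoder_ and prediction_ prefixes instead of the old flat names. One plausible reason for the rename is that a single flat dict can then be routed unambiguously to the encoder and to the prediction (RNN) network. The sketch below only illustrates that idea; the function name and the grouping logic are assumptions, not the actual Conformer constructor.

```python
def split_model_config(model_config: dict):
    """Group flat, prefixed keys into encoder / prediction / shared sub-configs.
    Illustrative only; the real Conformer class consumes these as keyword args."""
    encoder = {k[len("encoder_"):]: v
               for k, v in model_config.items() if k.startswith("encoder_")}
    prediction = {k[len("prediction_"):]: v
                  for k, v in model_config.items() if k.startswith("prediction_")}
    shared = {k: v for k, v in model_config.items()
              if not k.startswith(("encoder_", "prediction_"))}
    return encoder, prediction, shared

# A few keys taken from the config above:
enc, pred, shared = split_model_config({
    "name": "conformer",
    "encoder_dmodel": 144,
    "encoder_num_blocks": 16,
    "prediction_rnn_units": 320,
    "joint_dim": 320,
})
# enc    == {"dmodel": 144, "num_blocks": 16}
# pred   == {"rnn_units": 320}
# shared == {"name": "conformer", "joint_dim": 320}
```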
6 changes: 3 additions & 3 deletions examples/conformer/test_conformer.py
@@ -53,7 +53,7 @@
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -67,15 +67,15 @@
assert args.saved

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
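
The new ASRTFRecordTestDataset / ASRSliceTestDataset classes feed the BaseTester imported above. Conceptually, a test pass iterates the dataset, decodes each utterance, and accumulates error rates. The loop below is only a rough sketch of that idea; the batch layout, the model.recognize call, and the metric callables (such as the wer/cer helpers sketched earlier) are assumptions, not the actual BaseTester API.

```python
def run_test_pass(model, test_dataset, wer_fn, cer_fn):
    """Conceptual greedy-decoding evaluation loop (not TensorFlowASR's BaseTester)."""
    total_wer = total_cer = count = 0
    for signals, transcripts in test_dataset:        # assumed batch layout
        hypotheses = model.recognize(signals)        # assumed greedy decode API
        for hyp, ref in zip(hypotheses, transcripts):
            total_wer += wer_fn(ref, hyp)
            total_cer += cer_fn(ref, hyp)
            count += 1
    return {"greedy_wer": total_wer / max(count, 1),
            "greedy_cer": total_cer / max(count, 1)}
```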
6 changes: 3 additions & 3 deletions examples/conformer/test_subword_conformer.py
@@ -56,7 +56,7 @@
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -75,15 +75,15 @@
assert args.saved

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
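
The subword test script differs from the character-level one only in which text featurizer it builds. Below is a hedged sketch of that choice, mirroring the logic this commit adds to the demonstration script further down; the config path and subwords path are placeholders, not values from the repository.

```python
import os

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer

config = Config("examples/conformer/config.yml", learning=False)   # placeholder path
subwords_path = "conformer.subwords"                                # placeholder path

if os.path.exists(subwords_path):
    # reuse a previously generated subword vocabulary
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, subwords_path)
else:
    # fall back to plain character-level tokens
    text_featurizer = CharFeaturizer(config.decoder_config)
```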
13 changes: 6 additions & 7 deletions examples/conformer/train_ga_conformer.py
@@ -113,16 +113,15 @@
conformer._build(speech_featurizer.shape)
conformer.summary(line_length=120)

- optimizer_config = config.learning_config.optimizer_config
optimizer = tf.keras.optimizers.Adam(
TransformerSchedule(
- d_model=config.model_config["dmodel"],
- warmup_steps=optimizer_config["warmup_steps"],
- max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+ d_model=config.model_config["encoder_dmodel"],
+ warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+ max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
),
- beta_1=optimizer_config["beta1"],
- beta_2=optimizer_config["beta2"],
- epsilon=optimizer_config["epsilon"]
+ beta_1=config.learning_config.optimizer_config["beta1"],
+ beta_2=config.learning_config.optimizer_config["beta2"],
+ epsilon=config.learning_config.optimizer_config["epsilon"]
)

conformer_trainer.compile(model=conformer, optimizer=optimizer,
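
The training scripts now read encoder_dmodel when building the Transformer-style learning-rate schedule. For readers unfamiliar with that schedule, here is a generic sketch of the warmup/decay formula with an optional cap at max_lr. The class name, constructor arguments, and the concrete warmup/beta/epsilon values are assumptions for illustration; the repository's own TransformerSchedule and config values may differ.

```python
import math
import tensorflow as tf

class WarmupLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Transformer-style schedule:
    lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
    optionally clipped at max_lr. Sketch only, not the repo's TransformerSchedule."""

    def __init__(self, d_model, warmup_steps=4000, max_lr=None):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = float(warmup_steps)
        self.max_lr = max_lr

    def __call__(self, step):
        step = tf.cast(step, tf.float32) + 1.0                   # avoid step == 0
        lr = tf.math.rsqrt(self.d_model) * tf.minimum(
            tf.math.rsqrt(step), step * (self.warmup_steps ** -1.5))
        if self.max_lr is not None:
            lr = tf.minimum(lr, self.max_lr)
        return lr

# Roughly how the scripts above wire it up (144 matches encoder_dmodel; the other
# numbers are placeholders, the real ones come from the optimizer_config section):
optimizer = tf.keras.optimizers.Adam(
    WarmupLRSchedule(d_model=144, warmup_steps=10000, max_lr=0.05 / math.sqrt(144)),
    beta_1=0.9, beta_2=0.98, epsilon=1e-9)
```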
13 changes: 6 additions & 7 deletions examples/conformer/train_ga_subword_conformer.py
@@ -129,16 +129,15 @@
conformer._build(speech_featurizer.shape)
conformer.summary(line_length=120)

- optimizer_config = config.learning_config.optimizer_config
optimizer = tf.keras.optimizers.Adam(
TransformerSchedule(
- d_model=config.model_config["dmodel"],
- warmup_steps=optimizer_config["warmup_steps"],
- max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+ d_model=config.model_config["encoder_dmodel"],
+ warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+ max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
),
- beta_1=optimizer_config["beta1"],
- beta_2=optimizer_config["beta2"],
- epsilon=optimizer_config["epsilon"]
+ beta_1=config.learning_config.optimizer_config["beta1"],
+ beta_2=config.learning_config.optimizer_config["beta2"],
+ epsilon=config.learning_config.optimizer_config["epsilon"]
)

conformer_trainer.compile(model=conformer, optimizer=optimizer,
13 changes: 6 additions & 7 deletions examples/conformer/train_subword_conformer.py
@@ -126,16 +126,15 @@
conformer._build(speech_featurizer.shape)
conformer.summary(line_length=120)

- optimizer_config = config.learning_config.optimizer_config
optimizer = tf.keras.optimizers.Adam(
TransformerSchedule(
- d_model=config.model_config["dmodel"],
- warmup_steps=optimizer_config["warmup_steps"],
- max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+ d_model=config.model_config["encoder_dmodel"],
+ warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+ max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
),
- beta_1=optimizer_config["beta1"],
- beta_2=optimizer_config["beta2"],
- epsilon=optimizer_config["epsilon"]
+ beta_1=config.learning_config.optimizer_config["beta1"],
+ beta_2=config.learning_config.optimizer_config["beta2"],
+ epsilon=config.learning_config.optimizer_config["epsilon"]
)

conformer_trainer.compile(model=conformer, optimizer=optimizer,
6 changes: 3 additions & 3 deletions examples/deepspeech2/test_ds2.py
@@ -50,7 +50,7 @@
setup_devices([args.device])

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -70,15 +70,15 @@
ds2_model.add_featurizers(speech_featurizer, text_featurizer)

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
27 changes: 21 additions & 6 deletions examples/demonstration/conformer.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ import os
import argparse
from tensorflow_asr.utils import setup_environment, setup_devices

@@ -32,6 +33,8 @@
parser.add_argument("--blank", type=int, default=0,
help="Path to conformer tflite")

parser.add_argument("--beam_width", type=int, default=0, help="Beam width")

parser.add_argument("--num_rnns", type=int, default=1,
help="Number of RNN layers in prediction network")

@@ -47,19 +50,30 @@
parser.add_argument("--cpu", default=False, action="store_true",
help="Whether to only use cpu")

parser.add_argument("--subwords", type=str, default=None,
help="Path to file that stores generated subwords")

parser.add_argument("--output_name", type=str, default="test",
help="Result filename name prefix")

args = parser.parse_args()

setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
- from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
+ from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
from tensorflow_asr.models.conformer import Conformer

config = Config(args.config, learning=False)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
- text_featurizer = CharFeaturizer(config.decoder_config)
+ if args.subwords and os.path.exists(args.subwords):
+     print("Loading subwords ...")
+     text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+ else:
+     text_featurizer = CharFeaturizer(config.decoder_config)
+ text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
@@ -69,9 +83,10 @@
conformer.add_featurizers(speech_featurizer, text_featurizer)

signal = read_raw_audio(args.filename)
- predicted = tf.constant(args.blank, dtype=tf.int32)
- states = tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)

- hyp, _, _ = conformer.recognize_tflite(signal, predicted, states)
+ if (args.beam_width):
+     transcript = conformer.recognize_beam(signal[None, ...])
+ else:
+     transcript = conformer.recognize(signal[None, ...])

print("".join([chr(u) for u in hyp]))
tf.print("Transcript:", transcript[0])
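
The demonstration script now switches between greedy recognize and recognize_beam based on --beam_width. To make the difference concrete, here is a deliberately simplified, framework-free sketch of beam search over an RNN-Transducer: at most one non-blank label per encoder frame, no merging of identical prefixes, and a user-supplied step_fn standing in for the prediction and joint networks. It shows the general idea only and is not the algorithm implemented in TensorFlowASR.

```python
import numpy as np

def simple_transducer_beam_search(enc_frames, step_fn, init_state, blank=0, beam_width=4):
    """Simplified RNN-T beam search (one non-blank label per frame, no prefix merge).

    step_fn(enc_frame, last_label, state) must return
    (log_probs over the vocabulary including blank, new_predictor_state).
    """
    # Each hypothesis: (total_log_prob, label_sequence, predictor_state)
    beams = [(0.0, [], init_state)]
    for enc_t in enc_frames:
        candidates = []
        for log_p, labels, state in beams:
            last = labels[-1] if labels else blank
            log_probs, new_state = step_fn(enc_t, last, state)
            # Option 1: emit blank, move on to the next frame, predictor unchanged
            candidates.append((log_p + log_probs[blank], labels, state))
            # Option 2: emit one of the top non-blank labels, predictor advances
            order = np.argsort(log_probs)[::-1]
            for k in [k for k in order if k != blank][:beam_width]:
                candidates.append((log_p + log_probs[k], labels + [int(k)], new_state))
        candidates.sort(key=lambda c: c[0], reverse=True)   # prune to the beam
        beams = candidates[:beam_width]
    return beams[0][1]   # label ids of the best hypothesis

# In the real model, step_fn would run the prediction network on the last label
# and the joint network on (encoder frame, prediction output) to get log_probs.
```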
6 changes: 3 additions & 3 deletions examples/jasper/test_jasper.py
@@ -50,7 +50,7 @@
setup_devices([args.device])

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -70,15 +70,15 @@
jasper.add_featurizers(speech_featurizer, text_featurizer)

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
6 changes: 3 additions & 3 deletions examples/streaming_transducer/test_streaming_transducer.py
@@ -53,7 +53,7 @@
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -67,15 +67,15 @@
assert args.saved

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
examples/streaming_transducer/test_subword_streaming_transducer.py
@@ -56,7 +56,7 @@
setup_devices([args.device], cpu=args.cpu)

from tensorflow_asr.configs.config import Config
- from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+ from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
@@ -75,15 +75,15 @@
assert args.saved

if args.tfrecords:
- test_dataset = ASRTFRecordDataset(
+ test_dataset = ASRTFRecordTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
stage="test", shuffle=False
)
else:
- test_dataset = ASRSliceDataset(
+ test_dataset = ASRSliceTestDataset(
data_paths=config.learning_config.dataset_config.test_paths,
speech_featurizer=speech_featurizer,
text_featurizer=text_featurizer,
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,8 +1,8 @@
[flake8]
ignore = E402,E701,E702,E704,E251
- max-line-length = 100
+ max-line-length = 150

[pep8]
ignore = E402,E701,E702,E704,E251
- max-line-length = 100
+ max-line-length = 150
indent-size = 4
2 changes: 1 addition & 1 deletion setup.py
@@ -37,7 +37,7 @@

setuptools.setup(
name="TensorFlowASR",
version="0.3.2",
version="0.4.0",
author="Huy Le Nguyen",
author_email="[email protected]",
description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
