Merge pull request #115 from TensorSpeech/fix/tflite

Fix TFLite Conversion and Interpretation
TensorSpeech · Jan 15, 2021 · 86e8c43 · 86e8c43
2 parents 304330a + 6113267
commit 86e8c43
Show file tree

Hide file tree

Showing 10 changed files with 239 additions and 112 deletions.
diff --git a/examples/conformer/tflite_conformer.py b/examples/conformer/tflite_conformer.py
@@ -54,11 +54,10 @@
 conformer.summary(line_length=150)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
-concrete_func = conformer.make_tflite_function(greedy=True).get_concrete_function()
+concrete_func = conformer.make_tflite_function().get_concrete_function()
 converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
-                                       tf.lite.OpsSet.SELECT_TF_OPS]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
 tflite_model = converter.convert()
 
 if not os.path.exists(os.path.dirname(args.output)):

diff --git a/examples/conformer/tflite_subword_conformer.py b/examples/conformer/tflite_subword_conformer.py
@@ -62,7 +62,7 @@
 conformer.summary(line_length=150)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
-concrete_func = conformer.make_tflite_function(greedy=True).get_concrete_function()
+concrete_func = conformer.make_tflite_function().get_concrete_function()
 converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
 converter.experimental_new_converter = True
 converter.optimizations = [tf.lite.Optimize.DEFAULT]

diff --git a/setup.py b/setup.py
@@ -35,7 +35,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.6.3",
+    version="0.6.4",
     author="Huy Le Nguyen",
     author_email="[email protected]",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",

diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/contextnet.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """ Ref: https://github.com/iankur/ContextNet """
 
-from typing import List, Optional
+from typing import List
 import tensorflow as tf
 from .transducer import Transducer
 from ..utils.utils import merge_two_last_dims, get_reduced_length
@@ -245,13 +245,94 @@ def call(self, inputs, training=False, **kwargs):
         outputs = self.joint_net([enc, pred], training=training, **kwargs)
         return outputs
 
-    def encoder_inference(self,
-                          features: tf.Tensor,
-                          input_length: Optional[tf.Tensor] = None,
-                          with_batch: bool = False):
+    def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
         with tf.name_scope(f"{self.name}_encoder"):
-            if with_batch: return self.encoder([features, input_length], training=False)
             input_length = tf.expand_dims(tf.shape(features)[0], axis=0)
             outputs = tf.expand_dims(features, axis=0)
             outputs = self.encoder([outputs, input_length], training=False)
             return tf.squeeze(outputs, axis=0)
+
+    # -------------------------------- GREEDY -------------------------------------
+
+    @tf.function
+    def recognize(self,
+                  features: tf.Tensor,
+                  input_length: tf.Tensor,
+                  parallel_iterations: int = 10,
+                  swap_memory: bool = True):
+        """
+        RNN Transducer Greedy decoding
+        Args:
+            features (tf.Tensor): a batch of padded extracted features
+
+        Returns:
+            tf.Tensor: a batch of decoded transcripts
+        """
+        encoded = self.encoder([features, input_length], training=False)
+        return self._perform_greedy_batch(encoded, input_length,
+                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+
+    def recognize_tflite(self, signal, predicted, prediction_states):
+        """
+        Function to convert to tflite using greedy decoding (default streaming mode)
+        Args:
+            signal: tf.Tensor with shape [None] indicating a single audio signal
+            predicted: last predicted character with shape []
+            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
+
+        Return:
+            transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32
+            predicted: last predicted character with shape []
+            encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P]
+            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
+        """
+        features = self.speech_featurizer.tf_extract(signal)
+        encoded = self.encoder_inference(features, tf.shape(features)[0])
+        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states)
+        transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
+        return transcript, hypothesis.index, hypothesis.states
+
+    def recognize_tflite_with_timestamp(self, signal, predicted, states):
+        features = self.speech_featurizer.tf_extract(signal)
+        encoded = self.encoder_inference(features, tf.shape(features)[0])
+        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states)
+        indices = self.text_featurizer.normalize_indices(hypothesis.prediction)
+        upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1))  # [None, max_subword_length]
+
+        num_samples = tf.cast(tf.shape(signal)[0], dtype=tf.float32)
+        total_time_reduction_factor = self.time_reduction_factor * self.speech_featurizer.frame_step
+
+        stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
+        stime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
+
+        etime = tf.range(total_time_reduction_factor, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
+        etime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
+
+        non_blank = tf.where(tf.not_equal(upoints, 0))
+        non_blank_transcript = tf.gather_nd(upoints, non_blank)
+        non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
+        non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
+
+        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, hypothesis.states
+
+    # -------------------------------- BEAM SEARCH -------------------------------------
+
+    @tf.function
+    def recognize_beam(self,
+                       features: tf.Tensor,
+                       input_length: tf.Tensor,
+                       lm: bool = False,
+                       parallel_iterations: int = 10,
+                       swap_memory: bool = True):
+        """
+        RNN Transducer Beam Search
+        Args:
+            features (tf.Tensor): a batch of padded extracted features
+            lm (bool, optional): whether to use language model. Defaults to False.
+
+        Returns:
+            tf.Tensor: a batch of decoded transcripts
+        """
+        encoded = self.encoder([features, input_length], training=False)
+        return self._perform_beam_search_batch(encoded, input_length, lm,
+                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/streaming_transducer.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """ http://arxiv.org/abs/1811.06621 """
 
-from typing import Optional
 import tensorflow as tf
 
 from .layers.subsampling import TimeReduction
@@ -225,24 +224,18 @@ def __init__(self,
         )
         self.time_reduction_factor = self.encoder.time_reduction_factor
 
-    def encoder_inference(self,
-                          features: tf.Tensor,
-                          states: tf.Tensor,
-                          input_length: Optional[tf.Tensor] = None,
-                          with_batch: bool = False):
+    def encoder_inference(self, features: tf.Tensor, states: tf.Tensor):
         """Infer function for encoder (or encoders)
 
         Args:
             features (tf.Tensor): features with shape [T, F, C]
             states (tf.Tensor): previous states of encoders with shape [num_rnns, 1 or 2, 1, P]
-            with_batch (bool): indicates whether the features included batch dim or not
 
         Returns:
             tf.Tensor: output of encoders with shape [T, E]
             tf.Tensor: states of encoders with shape [num_rnns, 1 or 2, 1, P]
         """
         with tf.name_scope(f"{self.name}_encoder"):
-            if with_batch: return self.encoder.recognize(features, states)
             outputs = tf.expand_dims(features, axis=0)
             outputs, new_states = self.encoder.recognize(outputs, states)
             return tf.squeeze(outputs, axis=0), new_states
@@ -263,11 +256,7 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded, _ = self.encoder_inference(
-            features,
-            self.encoder.get_initial_state(),
-            input_length=input_length, with_batch=True
-        )
+        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state())
         return self._perform_greedy_batch(encoded, input_length,
                                           parallel_iterations=parallel_iterations, swap_memory=swap_memory)
 
@@ -290,12 +279,7 @@ def recognize_tflite(self, signal, predicted, encoder_states, prediction_states)
         encoded, new_encoder_states = self.encoder_inference(features, encoder_states)
         hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states)
         transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
-        return (
-            transcript,
-            hypothesis.prediction[-1],
-            new_encoder_states,
-            hypothesis.states
-        )
+        return transcript, hypothesis.index, new_encoder_states, hypothesis.states
 
     def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, prediction_states):
         features = self.speech_featurizer.tf_extract(signal)
@@ -318,14 +302,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, pre
         non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
         non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
 
-        return (
-            non_blank_transcript,
-            non_blank_stime,
-            non_blank_etime,
-            hypothesis.prediction,
-            new_encoder_states,
-            hypothesis.states
-        )
+        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, new_encoder_states, hypothesis.states
 
     # -------------------------------- BEAM SEARCH -------------------------------------
 
@@ -345,11 +322,7 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded, _ = self.encoder_inference(
-            features,
-            self.encoder.get_initial_state(),
-            input_length=input_length, with_batch=True
-        )
+        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state())
         return self._perform_beam_search_batch(encoded, input_length, lm,
                                                parallel_iterations=parallel_iterations, swap_memory=swap_memory)
 

diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer.py
@@ -14,7 +14,6 @@
 """ https://arxiv.org/pdf/1811.06621.pdf """
 
 import collections
-from typing import Optional
 import tensorflow as tf
 
 from . import Model
@@ -285,22 +284,16 @@ def call(self, inputs, training=False, **kwargs):
         outputs = self.joint_net([enc, pred], training=training, **kwargs)
         return outputs
 
-    def encoder_inference(self,
-                          features: tf.Tensor,
-                          input_length: Optional[tf.Tensor] = None,
-                          with_batch: Optional[bool] = False):
+    def encoder_inference(self, features: tf.Tensor):
         """Infer function for encoder (or encoders)
 
         Args:
             features (tf.Tensor): features with shape [T, F, C]
-            input_length (tf.Tensor): optional features length with shape []
-            with_batch (bool): indicates whether the features included batch dim or not
 
         Returns:
             tf.Tensor: output of encoders with shape [T, E]
         """
         with tf.name_scope(f"{self.name}_encoder"):
-            if with_batch: return self.encoder(features, training=False)
             outputs = tf.expand_dims(features, axis=0)
             outputs = self.encoder(outputs, training=False)
             return tf.squeeze(outputs, axis=0)
@@ -321,7 +314,7 @@ def decoder_inference(self, encoded: tf.Tensor, predicted: tf.Tensor, states: tf
             predicted = tf.reshape(predicted, [1, 1])  # [] => [1, 1]
             y, new_states = self.predict_net.recognize(predicted, states)  # [1, 1, P], states
             ytu = tf.nn.log_softmax(self.joint_net([encoded, y], training=False))  # [1, 1, V]
-            ytu = tf.squeeze(ytu, axis=None)  # [1, 1, V] => [V]
+            ytu = tf.reshape(ytu, shape=[-1])  # [1, 1, V] => [V]
             return ytu, new_states
 
     def get_config(self):
@@ -347,7 +340,7 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder_inference(features, input_length, with_batch=True)
+        encoded = self.encoder(features, training=True)
         return self._perform_greedy_batch(encoded, input_length,
                                           parallel_iterations=parallel_iterations, swap_memory=swap_memory)
 
@@ -368,11 +361,7 @@ def recognize_tflite(self, signal, predicted, states):
         encoded = self.encoder_inference(features)
         hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states)
         transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
-        return (
-            transcript,
-            hypothesis.prediction[-1],
-            hypothesis.states
-        )
+        return transcript, hypothesis.index, hypothesis.states
 
     def recognize_tflite_with_timestamp(self, signal, predicted, states):
         features = self.speech_featurizer.tf_extract(signal)
@@ -395,7 +384,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, states):
         non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
         non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
 
-        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.prediction, hypothesis.states
+        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, hypothesis.states
 
     def _perform_greedy_batch(self,
                               encoded: tf.Tensor,
@@ -450,48 +439,47 @@ def _perform_greedy(self,
             total = encoded_length
 
             hypothesis = Hypothesis(
-                index=tf.constant(self.text_featurizer.blank, dtype=tf.int32),
-                prediction=tf.ones([total], dtype=tf.int32) * self.text_featurizer.blank,
+                index=predicted,
+                prediction=tf.TensorArray(
+                    dtype=tf.int32, size=total, dynamic_size=False,
+                    clear_after_read=False, element_shape=tf.TensorShape([])
+                ),
                 states=states
             )
 
-            def condition(time, total, encoded, hypothesis): return tf.less(time, total)
+            def condition(_time, _total, _encoded, _hypothesis): return tf.less(_time, _total)
 
-            def body(time, total, encoded, hypothesis):
-                ytu, states = self.decoder_inference(
+            def body(_time, _total, _encoded, _hypothesis):
+                ytu, _states = self.decoder_inference(
                     # avoid using [index] in tflite
-                    encoded=tf.gather_nd(encoded, tf.expand_dims(time, axis=-1)),
-                    predicted=hypothesis.index,
-                    states=hypothesis.states
+                    encoded=tf.gather_nd(_encoded, tf.reshape(_time, shape=[1])),
+                    predicted=_hypothesis.index,
+                    states=_hypothesis.states
                 )
-                predict = tf.argmax(ytu, axis=-1, output_type=tf.int32)  # => argmax []
+                _predict = tf.argmax(ytu, axis=-1, output_type=tf.int32)  # => argmax []
 
-                index, predict, states = tf.cond(
-                    tf.equal(predict, self.text_featurizer.blank),
-                    true_fn=lambda: (hypothesis.index, predict, hypothesis.states),
-                    false_fn=lambda: (predict, predict, states)  # update if the new prediction is a non-blank
-                )
+                # something is wrong with tflite that drop support for tf.cond
+                # def equal_blank_fn(): return _hypothesis.index, _hypothesis.states
+                # def non_equal_blank_fn(): return _predict, _states  # update if the new prediction is a non-blank
+                # _index, _states = tf.cond(tf.equal(_predict, blank), equal_blank_fn, non_equal_blank_fn)
 
-                hypothesis = Hypothesis(
-                    index=index,
-                    prediction=tf.tensor_scatter_nd_update(
-                        hypothesis.prediction,
-                        indices=tf.reshape(time, [1, 1]),
-                        updates=tf.expand_dims(predict, axis=-1)
-                    ),
-                    states=states
-                )
+                _equal = tf.equal(_predict, self.text_featurizer.blank)
+                _index = tf.where(_equal, _hypothesis.index, _predict)
+                _states = tf.where(_equal, _hypothesis.states, _states)
+
+                _prediction = _hypothesis.prediction.write(_time, _predict)
+                _hypothesis = Hypothesis(index=_index, prediction=_prediction, states=_states)
 
-                return time + 1, total, encoded, hypothesis
+                return _time + 1, _total, _encoded, _hypothesis
 
-            time, total, encoded, hypothesis = tf.while_loop(
+            _, _, _, hypothesis = tf.while_loop(
                 condition, body,
                 loop_vars=[time, total, encoded, hypothesis],
                 parallel_iterations=parallel_iterations,
                 swap_memory=swap_memory
             )
 
-            return hypothesis
+            return Hypothesis(index=hypothesis.index, prediction=hypothesis.prediction.stack(), states=hypothesis.states)
 
     # -------------------------------- BEAM SEARCH -------------------------------------
 
@@ -511,7 +499,7 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder_inference(features, input_length, with_batch=True)
+        encoded = self.encoder(features, training=True)
         return self._perform_beam_search_batch(encoded, input_length, lm,
                                                parallel_iterations=parallel_iterations, swap_memory=swap_memory)