From d2c52dbc10277229377e9b56b11c75d2afdfa584 Mon Sep 17 00:00:00 2001
From: yangyaming <mxscmxsc@gmail.com>
Date: Mon, 23 Oct 2017 17:47:22 +0800
Subject: [PATCH 1/2] Give option to disable converting from transcription text
 to ids.

---
 deep_speech_2/data_utils/data.py                    | 13 ++++++++++---
 .../data_utils/featurizer/speech_featurizer.py      |  8 +++++---
 deep_speech_2/deploy/demo_server.py                 |  3 ++-
 deep_speech_2/infer.py                              |  6 +++---
 deep_speech_2/test.py                               |  6 +++---
 deep_speech_2/tools/tune.py                         |  6 +++---
 6 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py
index 71ba2434f2..edd4047ef3 100644
--- a/deep_speech_2/data_utils/data.py
+++ b/deep_speech_2/data_utils/data.py
@@ -55,6 +55,10 @@ class DataGenerator(object):
     :type num_threads: int
     :param random_seed: Random seed.
     :type random_seed: int
+    :param keep_transcription_text: If set to True, transcription text will
+                                    be passed forward directly without
+                                    converting to index sequence.
+    :type keep_transcription_text: bool
     """
 
     def __init__(self,
@@ -69,7 +73,8 @@ def __init__(self,
                  specgram_type='linear',
                  use_dB_normalization=True,
                  num_threads=multiprocessing.cpu_count() // 2,
-                 random_seed=0):
+                 random_seed=0,
+                 keep_transcription_text=False):
         self._max_duration = max_duration
         self._min_duration = min_duration
         self._normalizer = FeatureNormalizer(mean_std_filepath)
@@ -84,6 +89,7 @@ def __init__(self,
             use_dB_normalization=use_dB_normalization)
         self._num_threads = num_threads
         self._rng = random.Random(random_seed)
+        self._keep_transcription_text = keep_transcription_text
         self._epoch = 0
         # for caching tar files info
         self._local_data = local()
@@ -107,9 +113,10 @@ def process_utterance(self, filename, transcript):
         else:
             speech_segment = SpeechSegment.from_file(filename, transcript)
         self._augmentation_pipeline.transform_audio(speech_segment)
-        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
+        specgram, transcript_part = self._speech_featurizer.featurize(
+            speech_segment, self._keep_transcription_text)
         specgram = self._normalizer.apply(specgram)
-        return specgram, text_ids
+        return specgram, transcript_part
 
     def batch_reader_creator(self,
                              manifest_path,
diff --git a/deep_speech_2/data_utils/featurizer/speech_featurizer.py b/deep_speech_2/data_utils/featurizer/speech_featurizer.py
index a947588db4..4555dc31da 100644
--- a/deep_speech_2/data_utils/featurizer/speech_featurizer.py
+++ b/deep_speech_2/data_utils/featurizer/speech_featurizer.py
@@ -60,12 +60,12 @@ def __init__(self,
             target_dB=target_dB)
         self._text_featurizer = TextFeaturizer(vocab_filepath)
 
-    def featurize(self, speech_segment):
+    def featurize(self, speech_segment, keep_transcription_text):
         """Extract features for speech segment.
 
         1. For audio parts, extract the audio features.
-        2. For transcript parts, convert text string to a list of token indices
-           in char-level.
+        2. For transcript parts, keep the original text or convert text string
+           to a list of token indices in char-level.
 
         :param audio_segment: Speech segment to extract features from.
         :type audio_segment: SpeechSegment
@@ -74,6 +74,8 @@ def featurize(self, speech_segment):
         :rtype: tuple
         """
         audio_feature = self._audio_featurizer.featurize(speech_segment)
+        if keep_transcription_text:
+            return audio_feature, speech_segment.transcript
         text_ids = self._text_featurizer.featurize(speech_segment.transcript)
         return audio_feature, text_ids
 
diff --git a/deep_speech_2/deploy/demo_server.py b/deep_speech_2/deploy/demo_server.py
index b007c751e7..3e81c0c5b9 100644
--- a/deep_speech_2/deploy/demo_server.py
+++ b/deep_speech_2/deploy/demo_server.py
@@ -146,7 +146,8 @@ def start_server():
         mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=1)
+        num_threads=1,
+        keep_transcription_text=True)
     # prepare ASR model
     ds2_model = DeepSpeech2Model(
         vocab_size=data_generator.vocab_size,
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index a30d48d6de..74524602aa 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -68,7 +68,8 @@ def infer():
         mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=1)
+        num_threads=1,
+        keep_transcription_text=True)
     batch_reader = data_generator.batch_reader_creator(
         manifest_path=args.infer_manifest,
         batch_size=args.num_samples,
@@ -103,8 +104,7 @@ def infer():
 
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     target_transcripts = [
-        ''.join([data_generator.vocab_list[token] for token in transcript])
-        for _, transcript in infer_data
+        transcript for _, transcript in infer_data
     ]
     for target, result in zip(target_transcripts, result_transcripts):
         print("\nTarget Transcription: %s\nOutput Transcription: %s" %
diff --git a/deep_speech_2/test.py b/deep_speech_2/test.py
index 94c09150ca..5466f960b9 100644
--- a/deep_speech_2/test.py
+++ b/deep_speech_2/test.py
@@ -69,7 +69,8 @@ def evaluate():
         mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_proc_data)
+        num_threads=args.num_proc_data,
+        keep_transcription_text=True)
     batch_reader = data_generator.batch_reader_creator(
         manifest_path=args.test_manifest,
         batch_size=args.batch_size,
@@ -104,8 +105,7 @@ def evaluate():
             language_model_path=args.lang_model_path,
             num_processes=args.num_proc_bsearch)
         target_transcripts = [
-            ''.join([data_generator.vocab_list[token] for token in transcript])
-            for _, transcript in infer_data
+            transcript for _, transcript in infer_data
         ]
         for target, result in zip(target_transcripts, result_transcripts):
             error_sum += error_rate_func(target, result)
diff --git a/deep_speech_2/tools/tune.py b/deep_speech_2/tools/tune.py
index 233ec4ab84..99ffb5f5d1 100644
--- a/deep_speech_2/tools/tune.py
+++ b/deep_speech_2/tools/tune.py
@@ -87,7 +87,8 @@ def tune():
         mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_proc_data)
+        num_threads=args.num_proc_data,
+        keep_transcription_text=True)
 
     audio_data = paddle.layer.data(
         name="audio_spectrogram",
@@ -164,8 +165,7 @@ def tune():
         ]
 
         target_transcripts = [
-            ''.join([data_generator.vocab_list[token] for token in transcript])
-            for _, transcript in infer_data
+            transcript for _, transcript in infer_data
         ]
 
         num_ins += len(target_transcripts)

From 081789bc0aeb3887df9d5486a93a1589f23daad8 Mon Sep 17 00:00:00 2001
From: yangyaming <mxscmxsc@gmail.com>
Date: Fri, 3 Nov 2017 14:52:10 +0800
Subject: [PATCH 2/2] Add doc and adjust some code.

---
 deep_speech_2/data_utils/data.py | 4 ++--
 deep_speech_2/infer.py           | 4 +---
 deep_speech_2/test.py            | 4 +---
 deep_speech_2/tools/tune.py      | 4 +---
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py
index edd4047ef3..70ee6fbad9 100644
--- a/deep_speech_2/data_utils/data.py
+++ b/deep_speech_2/data_utils/data.py
@@ -103,8 +103,8 @@ def process_utterance(self, filename, transcript):
         :type filename: basestring | file
         :param transcript: Transcription text.
         :type transcript: basestring
-        :return: Tuple of audio feature tensor and list of token ids for
-                 transcription.
+        :return: Tuple of audio feature tensor and data of transcription part,
+                 where transcription part could be token ids or text.
         :rtype: tuple of (2darray, list)
         """
         if filename.startswith('tar:'):
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 74524602aa..9ac3e632ef 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -103,9 +103,7 @@ def infer():
         num_processes=args.num_proc_bsearch)
 
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
-    target_transcripts = [
-        transcript for _, transcript in infer_data
-    ]
+    target_transcripts = [transcript for _, transcript in infer_data]
     for target, result in zip(target_transcripts, result_transcripts):
         print("\nTarget Transcription: %s\nOutput Transcription: %s" %
               (target, result))
diff --git a/deep_speech_2/test.py b/deep_speech_2/test.py
index 5466f960b9..63fc4f65c9 100644
--- a/deep_speech_2/test.py
+++ b/deep_speech_2/test.py
@@ -104,9 +104,7 @@ def evaluate():
             vocab_list=vocab_list,
             language_model_path=args.lang_model_path,
             num_processes=args.num_proc_bsearch)
-        target_transcripts = [
-            transcript for _, transcript in infer_data
-        ]
+        target_transcripts = [transcript for _, transcript in infer_data]
         for target, result in zip(target_transcripts, result_transcripts):
             error_sum += error_rate_func(target, result)
             num_ins += 1
diff --git a/deep_speech_2/tools/tune.py b/deep_speech_2/tools/tune.py
index 99ffb5f5d1..966029a825 100644
--- a/deep_speech_2/tools/tune.py
+++ b/deep_speech_2/tools/tune.py
@@ -164,9 +164,7 @@ def tune():
             for i in xrange(len(infer_data))
         ]
 
-        target_transcripts = [
-            transcript for _, transcript in infer_data
-        ]
+        target_transcripts = [transcript for _, transcript in infer_data]
 
         num_ins += len(target_transcripts)
         # grid search