From d2c52dbc10277229377e9b56b11c75d2afdfa584 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 23 Oct 2017 17:47:22 +0800 Subject: [PATCH 1/2] Give option to disable converting from transcription text to ids. --- deep_speech_2/data_utils/data.py | 13 ++++++++++--- .../data_utils/featurizer/speech_featurizer.py | 8 +++++--- deep_speech_2/deploy/demo_server.py | 3 ++- deep_speech_2/infer.py | 6 +++--- deep_speech_2/test.py | 6 +++--- deep_speech_2/tools/tune.py | 6 +++--- 6 files changed, 26 insertions(+), 16 deletions(-) diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py index 71ba2434f2..edd4047ef3 100644 --- a/deep_speech_2/data_utils/data.py +++ b/deep_speech_2/data_utils/data.py @@ -55,6 +55,10 @@ class DataGenerator(object): :type num_threads: int :param random_seed: Random seed. :type random_seed: int + :param keep_transcription_text: If set to True, transcription text will + be passed forward directly without + converting to index sequence. + :type keep_transcription_text: bool """ def __init__(self, @@ -69,7 +73,8 @@ def __init__(self, specgram_type='linear', use_dB_normalization=True, num_threads=multiprocessing.cpu_count() // 2, - random_seed=0): + random_seed=0, + keep_transcription_text=False): self._max_duration = max_duration self._min_duration = min_duration self._normalizer = FeatureNormalizer(mean_std_filepath) @@ -84,6 +89,7 @@ def __init__(self, use_dB_normalization=use_dB_normalization) self._num_threads = num_threads self._rng = random.Random(random_seed) + self._keep_transcription_text = keep_transcription_text self._epoch = 0 # for caching tar files info self._local_data = local() @@ -107,9 +113,10 @@ def process_utterance(self, filename, transcript): else: speech_segment = SpeechSegment.from_file(filename, transcript) self._augmentation_pipeline.transform_audio(speech_segment) - specgram, text_ids = self._speech_featurizer.featurize(speech_segment) + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) specgram = self._normalizer.apply(specgram) - return specgram, text_ids + return specgram, transcript_part def batch_reader_creator(self, manifest_path, diff --git a/deep_speech_2/data_utils/featurizer/speech_featurizer.py b/deep_speech_2/data_utils/featurizer/speech_featurizer.py index a947588db4..4555dc31da 100644 --- a/deep_speech_2/data_utils/featurizer/speech_featurizer.py +++ b/deep_speech_2/data_utils/featurizer/speech_featurizer.py @@ -60,12 +60,12 @@ def __init__(self, target_dB=target_dB) self._text_featurizer = TextFeaturizer(vocab_filepath) - def featurize(self, speech_segment): + def featurize(self, speech_segment, keep_transcription_text): """Extract features for speech segment. 1. For audio parts, extract the audio features. - 2. For transcript parts, convert text string to a list of token indices - in char-level. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. :param audio_segment: Speech segment to extract features from. :type audio_segment: SpeechSegment @@ -74,6 +74,8 @@ def featurize(self, speech_segment): :rtype: tuple """ audio_feature = self._audio_featurizer.featurize(speech_segment) + if keep_transcription_text: + return audio_feature, speech_segment.transcript text_ids = self._text_featurizer.featurize(speech_segment.transcript) return audio_feature, text_ids diff --git a/deep_speech_2/deploy/demo_server.py b/deep_speech_2/deploy/demo_server.py index b007c751e7..3e81c0c5b9 100644 --- a/deep_speech_2/deploy/demo_server.py +++ b/deep_speech_2/deploy/demo_server.py @@ -146,7 +146,8 @@ def start_server(): mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=1) + num_threads=1, + keep_transcription_text=True) # prepare ASR model ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py index a30d48d6de..74524602aa 100644 --- a/deep_speech_2/infer.py +++ b/deep_speech_2/infer.py @@ -68,7 +68,8 @@ def infer(): mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=1) + num_threads=1, + keep_transcription_text=True) batch_reader = data_generator.batch_reader_creator( manifest_path=args.infer_manifest, batch_size=args.num_samples, @@ -103,8 +104,7 @@ def infer(): error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ - ''.join([data_generator.vocab_list[token] for token in transcript]) - for _, transcript in infer_data + transcript for _, transcript in infer_data ] for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % diff --git a/deep_speech_2/test.py b/deep_speech_2/test.py index 94c09150ca..5466f960b9 100644 --- a/deep_speech_2/test.py +++ b/deep_speech_2/test.py @@ -69,7 +69,8 @@ def evaluate(): mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_proc_data) + num_threads=args.num_proc_data, + keep_transcription_text=True) batch_reader = data_generator.batch_reader_creator( manifest_path=args.test_manifest, batch_size=args.batch_size, @@ -104,8 +105,7 @@ def evaluate(): language_model_path=args.lang_model_path, num_processes=args.num_proc_bsearch) target_transcripts = [ - ''.join([data_generator.vocab_list[token] for token in transcript]) - for _, transcript in infer_data + transcript for _, transcript in infer_data ] for target, result in zip(target_transcripts, result_transcripts): error_sum += error_rate_func(target, result) diff --git a/deep_speech_2/tools/tune.py b/deep_speech_2/tools/tune.py index 233ec4ab84..99ffb5f5d1 100644 --- a/deep_speech_2/tools/tune.py +++ b/deep_speech_2/tools/tune.py @@ -87,7 +87,8 @@ def tune(): mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_proc_data) + num_threads=args.num_proc_data, + keep_transcription_text=True) audio_data = paddle.layer.data( name="audio_spectrogram", @@ -164,8 +165,7 @@ def tune(): ] target_transcripts = [ - ''.join([data_generator.vocab_list[token] for token in transcript]) - for _, transcript in infer_data + transcript for _, transcript in infer_data ] num_ins += len(target_transcripts) From 081789bc0aeb3887df9d5486a93a1589f23daad8 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 3 Nov 2017 14:52:10 +0800 Subject: [PATCH 2/2] Add doc and adjust some codes. --- deep_speech_2/data_utils/data.py | 4 ++-- deep_speech_2/infer.py | 4 +--- deep_speech_2/test.py | 4 +--- deep_speech_2/tools/tune.py | 4 +--- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py index edd4047ef3..70ee6fbad9 100644 --- a/deep_speech_2/data_utils/data.py +++ b/deep_speech_2/data_utils/data.py @@ -103,8 +103,8 @@ def process_utterance(self, filename, transcript): :type filename: basestring | file :param transcript: Transcription text. :type transcript: basestring - :return: Tuple of audio feature tensor and list of token ids for - transcription. + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. :rtype: tuple of (2darray, list) """ if filename.startswith('tar:'): diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py index 74524602aa..9ac3e632ef 100644 --- a/deep_speech_2/infer.py +++ b/deep_speech_2/infer.py @@ -103,9 +103,7 @@ def infer(): num_processes=args.num_proc_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer - target_transcripts = [ - transcript for _, transcript in infer_data - ] + target_transcripts = [transcript for _, transcript in infer_data] for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % (target, result)) diff --git a/deep_speech_2/test.py b/deep_speech_2/test.py index 5466f960b9..63fc4f65c9 100644 --- a/deep_speech_2/test.py +++ b/deep_speech_2/test.py @@ -104,9 +104,7 @@ def evaluate(): vocab_list=vocab_list, language_model_path=args.lang_model_path, num_processes=args.num_proc_bsearch) - target_transcripts = [ - transcript for _, transcript in infer_data - ] + target_transcripts = [transcript for _, transcript in infer_data] for target, result in zip(target_transcripts, result_transcripts): error_sum += error_rate_func(target, result) num_ins += 1 diff --git a/deep_speech_2/tools/tune.py b/deep_speech_2/tools/tune.py index 99ffb5f5d1..966029a825 100644 --- a/deep_speech_2/tools/tune.py +++ b/deep_speech_2/tools/tune.py @@ -164,9 +164,7 @@ def tune(): for i in xrange(len(infer_data)) ] - target_transcripts = [ - transcript for _, transcript in infer_data - ] + target_transcripts = [transcript for _, transcript in infer_data] num_ins += len(target_transcripts) # grid search