diff --git a/examples/microphone-speech-to-text.py b/examples/microphone-speech-to-text.py index fb0fbd1a..9174de74 100644 --- a/examples/microphone-speech-to-text.py +++ b/examples/microphone-speech-to-text.py @@ -72,7 +72,8 @@ def recognize_using_weboscket(*args): mycallback = MyRecognizeCallback() speech_to_text.recognize_using_websocket(audio=audio_source, content_type='audio/l16; rate=44100', - recognize_callback=mycallback) + recognize_callback=mycallback, + interim_results=True) ############################################### #### Prepare the for recording using Pyaudio ## diff --git a/ibm_watson/speech_to_text_v1_adapter.py b/ibm_watson/speech_to_text_v1_adapter.py index 67820be8..dabe6526 100644 --- a/ibm_watson/speech_to_text_v1_adapter.py +++ b/ibm_watson/speech_to_text_v1_adapter.py @@ -33,6 +33,7 @@ def recognize_using_websocket(self, customization_weight=None, base_model_version=None, inactivity_timeout=None, + interim_results=None, keywords=None, keywords_threshold=None, max_alternatives=None, @@ -54,6 +55,7 @@ def recognize_using_websocket(self, split_transcript_at_phrase_end=None, speech_detector_sensitivity=None, background_audio_suppression=None, + low_latency=None, character_insertion_bias=None, **kwargs): """ @@ -269,6 +271,22 @@ def recognize_using_websocket(self, * 1.0 suppresses all audio (no audio is transcribed). The values increase on a monotonic curve. See [Background audio suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). + :param bool low_latency: (optional) If `true` for next-generation + `Multimedia` and `Telephony` models that support low latency, directs the + service to produce results even more quickly than it usually does. + Next-generation models produce transcription results faster than + previous-generation models. The `low_latency` parameter causes the models + to produce results even more quickly, though the results might be less + accurate when the parameter is used. + **Note:** The parameter is beta functionality. It is not available for + previous-generation `Broadband` and `Narrowband` models. It is available + only for some next-generation models. + * For a list of next-generation models that support low latency, see + [Supported language + models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) + for next-generation models. + * For more information about the `low_latency` parameter, see [Low + latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). :param float character_insertion_bias: (optional) For next-generation `Multimedia` and `Telephony` models, an indication of whether the service is biased to recognize shorter or longer strings of characters when @@ -337,6 +355,7 @@ def recognize_using_websocket(self, 'customization_weight': customization_weight, 'content_type': content_type, 'inactivity_timeout': inactivity_timeout, + 'interim_results': interim_results, 'keywords': keywords, 'keywords_threshold': keywords_threshold, 'max_alternatives': max_alternatives, @@ -356,7 +375,8 @@ def recognize_using_websocket(self, 'split_transcript_at_phrase_end': split_transcript_at_phrase_end, 'speech_detector_sensitivity': speech_detector_sensitivity, 'background_audio_suppression': background_audio_suppression, - 'character_insertion_bias': character_insertion_bias + 'character_insertion_bias': character_insertion_bias, + 'low_latency': low_latency, } options = {k: v for k, v in options.items() if v is not None} request['options'] = options diff --git a/ibm_watson/websocket/recognize_listener.py b/ibm_watson/websocket/recognize_listener.py index 041bcf69..43eb7961 100644 --- a/ibm_watson/websocket/recognize_listener.py +++ b/ibm_watson/websocket/recognize_listener.py @@ -196,15 +196,16 @@ def on_data(self, ws, message, message_type, fin): # set of transcriptions and send them to the appropriate callbacks. results = json_object.get('results') if results: - b_final = (results[0].get('final') is True) - alternatives = results[0].get('alternatives') - if alternatives: - hypothesis = alternatives[0].get('transcript') - transcripts = self.extract_transcripts(alternatives) - if b_final: - self.callback.on_transcription(transcripts) - if hypothesis: - self.callback.on_hypothesis(hypothesis) + if (self.options.get('interim_results') is True): + b_final = (results[0].get('final') is True) + alternatives = results[0].get('alternatives') + if alternatives: + hypothesis = alternatives[0].get('transcript') + transcripts = self.extract_transcripts(alternatives) + if b_final: + self.callback.on_transcription(transcripts) + if hypothesis: + self.callback.on_hypothesis(hypothesis) else: final_transcript = [] for result in results: diff --git a/test/integration/test_speech_to_text_v1.py b/test/integration/test_speech_to_text_v1.py index 1d41df96..4defbea1 100644 --- a/test/integration/test_speech_to_text_v1.py +++ b/test/integration/test_speech_to_text_v1.py @@ -118,6 +118,66 @@ def on_data(self, data): assert test_callback.data['results'][0]['alternatives'][0] ['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain ' + def test_on_transcription_interim_results_false(self): + class MyRecognizeCallback(RecognizeCallback): + def __init__(self): + RecognizeCallback.__init__(self) + self.error = None + self.transcript = None + def on_error(self, error): + self.error = error + def on_transcription(self, transcript): + self.transcript = transcript + test_callback = MyRecognizeCallback() + with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file: + audio_source = AudioSource(audio_file, False) + self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony", + interim_results=False, low_latency=False) + assert test_callback.error is None + assert test_callback.transcript is not None + assert test_callback.transcript[0][0]['transcript'] in ['isolated tornadoes ', 'isolated tornados '] + assert test_callback.transcript[1][0]['transcript'] == 'and heavy rain ' + def test_on_transcription_interim_results_true(self): + class MyRecognizeCallback(RecognizeCallback): + def __init__(self): + RecognizeCallback.__init__(self) + self.error = None + self.transcript = None + def on_error(self, error): + self.error = error + def on_transcription(self, transcript): + self.transcript = transcript + assert transcript[0]['confidence'] is not None + assert transcript[0]['transcript'] is not None + test_callback = MyRecognizeCallback() + with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file: + audio_source = AudioSource(audio_file, False) + self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony", + interim_results=True, low_latency=True) + assert test_callback.error is None + assert test_callback.transcript is not None + assert test_callback.transcript[0]['transcript'] == 'and heavy rain ' + def test_on_transcription_interim_results_true_low_latency_false(self): + class MyRecognizeCallback(RecognizeCallback): + def __init__(self): + RecognizeCallback.__init__(self) + self.error = None + self.transcript = None + def on_error(self, error): + self.error = error + def on_transcription(self, transcript): + self.transcript = transcript + assert transcript[0]['confidence'] is not None + assert transcript[0]['transcript'] is not None + test_callback = MyRecognizeCallback() + with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file: + audio_source = AudioSource(audio_file, False) + self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony", + interim_results=True, low_latency=False) + assert test_callback.error is None + assert test_callback.transcript is not None + assert test_callback.transcript[0]['transcript'] == 'and heavy rain ' + def test_custom_grammars(self): customization_id = None for custom_model in self.custom_models.get('customizations'):