Merge pull request #2644 from daspecster/speech-streaming-part-3

Add _stream_requests() for managing speech streaming configuration
googleapis · Oct 31, 2016 · 3b3c9ce · 3b3c9ce
2 parents 8b22482 + 702f0bd
commit 3b3c9ce
Show file tree

Hide file tree

Showing 3 changed files with 146 additions and 2 deletions.
diff --git a/packages/google-cloud-speech/google/cloud/speech/_gax.py b/packages/google-cloud-speech/google/cloud/speech/_gax.py
@@ -145,6 +145,82 @@ def sync_recognize(self, sample, language_code=None, max_alternatives=None,
             raise ValueError('More than one result or none returned from API.')
 
 
+def _stream_requests(sample, language_code=None, max_alternatives=None,
+                     profanity_filter=None, speech_context=None,
+                     single_utterance=None, interim_results=None):
+    """Generate stream of requests from sample.
+
+    :type sample: :class:`~google.cloud.speech.sample.Sample`
+    :param sample: Instance of ``Sample`` containing audio information.
+
+    :type language_code: str
+    :param language_code: (Optional) The language of the supplied audio as
+                          a BCP-47 language tag. Example: ``'en-GB'``.
+                          If omitted, defaults to ``'en-US'``.
+
+    :type max_alternatives: int
+    :param max_alternatives: (Optional) Maximum number of recognition
+                             hypotheses to be returned. The server may
+                             return fewer than ``max_alternatives``.
+                             Valid values are 0-30. A value of 0 or 1
+                             will return a maximum of 1. Defaults to 1.
+
+    :type profanity_filter: bool
+    :param profanity_filter: (Optional) If True, the server will attempt to
+                             filter out profanities, replacing all but the
+                             initial character in each filtered word with
+                             asterisks, e.g. ``'f***'``. If False or
+                             omitted, profanities won't be filtered out.
+
+    :type speech_context: list
+    :param speech_context: (Optional) A list of strings (max 50) containing
+                           words and phrases as "hints" so that the speech
+                           recognition is more likely to recognize them.
+                           This can be used to improve the accuracy for
+                           specific words and phrases. This can also be used to
+                           add new words to the vocabulary of the recognizer.
+
+    :type single_utterance: bool
+    :param single_utterance: (Optional) If false or omitted, the recognizer
+                             will perform continuous recognition
+                             (continuing to process audio even if the user
+                             pauses speaking) until the client closes the
+                             output stream (gRPC API) or when the maximum
+                             time limit has been reached. Multiple
+                             SpeechRecognitionResults with the is_final
+                             flag set to true may be returned.
+
+                             If true, the recognizer will detect a single
+                             spoken utterance. When it detects that the
+                             user has paused or stopped speaking, it will
+                             return an END_OF_UTTERANCE event and cease
+                             recognition. It will return no more than one
+                             SpeechRecognitionResult with the is_final flag
+                             set to true.
+
+    :type interim_results: bool
+    :param interim_results: (Optional) If true, interim results (tentative
+                            hypotheses) may be returned as they become
+                            available (these interim results are indicated
+                            with the is_final=false flag). If false or
+                            omitted, only is_final=true result(s) are
+                            returned.
+    """
+    config_request = _make_streaming_request(
+        sample, language_code=language_code, max_alternatives=max_alternatives,
+        profanity_filter=profanity_filter, speech_context=speech_context,
+        single_utterance=single_utterance, interim_results=interim_results)
+
+    # The config request MUST go first and not contain any audio data.
+    yield config_request
+
+    while True:
+        data = sample.content.read(sample.chunk_size)
+        if not data:
+            break
+        yield StreamingRecognizeRequest(audio_content=data)
+
+
 def _make_streaming_request(sample, language_code,
                             max_alternatives, profanity_filter,
                             speech_context, single_utterance,

diff --git a/packages/google-cloud-speech/google/cloud/speech/sample.py b/packages/google-cloud-speech/google/cloud/speech/sample.py
@@ -68,6 +68,15 @@ def __init__(self, content=None, source_uri=None,
         else:
             raise ValueError('Invalid encoding: %s' % (encoding,))
 
+    @property
+    def chunk_size(self):
+        """Chunk size to send over gRPC (about 100ms worth of samples).
+
+        :rtype: int
+        :returns: One tenth of the sample rate, truncated to an integer.
+        """
+        return int(self.sample_rate / 10.0)
+
     @property
     def source_uri(self):
         """Google Cloud Storage URI of audio source.

diff --git a/packages/google-cloud-speech/unit_tests/test__gax.py b/packages/google-cloud-speech/unit_tests/test__gax.py
@@ -15,10 +15,10 @@
 import unittest
 
 
-class TestSpeechGAX(unittest.TestCase):
+class TestSpeechGAXMakeRequests(unittest.TestCase):
     SAMPLE_RATE = 16000
     HINTS = ['hi']
-    AUDIO_CONTENT = '/9j/4QNURXhpZgAASUkq'
+    AUDIO_CONTENT = b'/9j/4QNURXhpZgAASUkq'
 
     def _callFUT(self, sample, language_code, max_alternatives,
                  profanity_filter, speech_context, single_utterance,
@@ -78,3 +78,62 @@ def test_ctor(self):
         self.assertEqual(config.max_alternatives, max_alternatives)
         self.assertTrue(config.profanity_filter)
         self.assertEqual(config.speech_context.phrases, self.HINTS)
+
+
+class TestSpeechGAXMakeRequestsStream(unittest.TestCase):
+    SAMPLE_RATE = 16000
+    HINTS = ['hi']
+    AUDIO_CONTENT = b'/9j/4QNURXhpZgAASUkq'
+
+    def _callFUT(self, sample, language_code, max_alternatives,
+                 profanity_filter, speech_context, single_utterance,
+                 interim_results):
+        from google.cloud.speech._gax import _stream_requests
+        return _stream_requests(sample=sample,
+                                language_code=language_code,
+                                max_alternatives=max_alternatives,
+                                profanity_filter=profanity_filter,
+                                speech_context=speech_context,
+                                single_utterance=single_utterance,
+                                interim_results=interim_results)
+
+    def test_stream_requests(self):
+        from io import BytesIO
+        from google.cloud import speech
+        from google.cloud.speech.sample import Sample
+        from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import (
+            SpeechContext)
+        from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import (
+            StreamingRecognitionConfig)
+        from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import (
+            StreamingRecognizeRequest)
+
+        sample = Sample(content=BytesIO(self.AUDIO_CONTENT),
+                        encoding=speech.Encoding.FLAC,
+                        sample_rate=self.SAMPLE_RATE)
+        language_code = 'US-en'
+        max_alternatives = 2
+        profanity_filter = True
+        speech_context = SpeechContext(phrases=self.HINTS)
+        single_utterance = True
+        interim_results = False
+        streaming_requests = self._callFUT(sample, language_code,
+                                           max_alternatives, profanity_filter,
+                                           speech_context, single_utterance,
+                                           interim_results)
+        all_requests = []
+        for streaming_request in streaming_requests:
+            self.assertIsInstance(streaming_request, StreamingRecognizeRequest)
+            all_requests.append(streaming_request)
+
+        self.assertEqual(len(all_requests), 2)
+
+        config_request = all_requests[0]
+        streaming_request = all_requests[1]
+        # ``audio_content`` isn't set by _make_streaming_request().
+        # The first request can only have `streaming_config` set.
+        # The following requests can only have `audio_content` set.
+        self.assertEqual(config_request.audio_content, b'')
+        self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT)
+        self.assertIsInstance(config_request.streaming_config,
+                              StreamingRecognitionConfig)