diff --git a/speech/cloud_speech.proto b/speech/cloud_speech.proto new file mode 100644 index 00000000000..c55ec0d5cb1 --- /dev/null +++ b/speech/cloud_speech.proto @@ -0,0 +1,321 @@ +syntax = "proto3"; + +package google.cloud.speech.v1beta1; + +option java_multiple_files = true; +option java_outer_classname = "SpeechProto"; +option java_package = "com.google.cloud.speech.v1beta1"; + +import "google/api/annotations.proto"; +import "google/longrunning/operations.proto"; +import "google/rpc/status.proto"; + + +// Service that implements Google Cloud Speech API. +service Speech { + // Perform synchronous speech-recognition: receive results after all audio + // has been sent and processed. + rpc SyncRecognize(SyncRecognizeRequest) returns (SyncRecognizeResponse) { + option (google.api.http) = + { post: "/v1beta1/speech:syncrecognize" body: "*" }; + } + + // Perform asynchronous speech-recognition: receive results via the + // google.longrunning.Operations interface. `Operation.response` returns + // `AsyncRecognizeResponse`. + rpc AsyncRecognize(AsyncRecognizeRequest) + returns (google.longrunning.Operation) { + option (google.api.http) = + { post: "/v1beta1/speech:asyncrecognize" body: "*" }; + } + + // Perform bidirectional streaming speech-recognition: receive results while + // sending audio. This method is only available via the gRPC API (not REST). + rpc StreamingRecognize(stream StreamingRecognizeRequest) + returns (stream StreamingRecognizeResponse); +} + +// `SyncRecognizeRequest` is the top-level message sent by the client for +// the `SyncRecognize` method. +message SyncRecognizeRequest { + // [Required] The `config` message provides information to the recognizer + // that specifies how to process the request. + RecognitionConfig config = 1; + + // [Required] The audio data to be recognized. + RecognitionAudio audio = 2; +} + +// `AsyncRecognizeRequest` is the top-level message sent by the client for +// the `AsyncRecognize` method. +message AsyncRecognizeRequest { + // [Required] The `config` message provides information to the recognizer + // that specifies how to process the request. + RecognitionConfig config = 1; + + // [Required] The audio data to be recognized. + RecognitionAudio audio = 2; +} + +// `StreamingRecognizeRequest` is the top-level message sent by the client for +// the `StreamingRecognize`. Multiple `StreamingRecognizeRequest` messages are +// sent. The first message must contain a `streaming_config` message and must +// not contain `audio` data. All subsequent messages must contain `audio` data +// and must not contain a `streaming_config` message. +message StreamingRecognizeRequest { + oneof streaming_request { + // The `streaming_config` message provides information to the recognizer + // that specifies how to process the request. + // + // The first `StreamingRecognizeRequest` message must contain a + // `streaming_config` message. + StreamingRecognitionConfig streaming_config = 1; + + // The audio data to be recognized. Sequential chunks of audio data are sent + // in sequential `StreamingRecognizeRequest` messages. The first + // `StreamingRecognizeRequest` message must not contain `audio_content` data + // and all subsequent `StreamingRecognizeRequest` messages must contain + // `audio_content` data. The audio bytes must be encoded as specified in + // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a + // pure binary representation (not base64). + bytes audio_content = 2 [ctype = CORD]; + } +} + +// The `StreamingRecognitionConfig` message provides information to the +// recognizer that specifies how to process the request. +message StreamingRecognitionConfig { + // [Required] The `config` message provides information to the recognizer + // that specifies how to process the request. + RecognitionConfig config = 1; + + // [Optional] If `false` or omitted, the recognizer will perform continuous + // recognition (continuing to process audio even if the user pauses speaking) + // until the client closes the output stream (gRPC API) or when the maximum + // time limit has been reached. Multiple `SpeechRecognitionResult`s with the + // `is_final` flag set to `true` may be returned. + // + // If `true`, the recognizer will detect a single spoken utterance. When it + // detects that the user has paused or stopped speaking, it will return an + // `END_OF_UTTERANCE` event and cease recognition. It will return no more than + // one `SpeechRecognitionResult` with the `is_final` flag set to `true`. + bool single_utterance = 2; + + // [Optional] If `true`, interim results (tentative hypotheses) may be + // returned as they become available (these interim results are indicated with + // the `is_final=false` flag). + // If `false` or omitted, only `is_final=true` result(s) are returned. + bool interim_results = 3; +} + +// The `RecognitionConfig` message provides information to the recognizer +// that specifies how to process the request. +message RecognitionConfig { + // Audio encoding of the data sent in the audio message. All encodings support + // only 1 channel (mono) audio. Only `FLAC` includes a header that describes + // the bytes of audio that follow the header. The other encodings are raw + // audio bytes with no header. + // + // For best results, the audio source should be captured and transmitted using + // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be + // reduced if lossy codecs (such as AMR, AMR_WB and MULAW) are used to capture + // or transmit the audio, particularly if background noise is present. + enum AudioEncoding { + // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][]. + ENCODING_UNSPECIFIED = 0; + + // Uncompressed 16-bit signed little-endian samples. + // This is the only encoding that may be used by `AsyncRecognize`. + LINEAR16 = 1; + + // This is the recommended encoding for `SyncRecognize` and + // `StreamingRecognize` because it uses lossless compression; therefore + // recognition accuracy is not compromised by a lossy codec. + // + // The stream FLAC (Free Lossless Audio Codec) encoding is specified at: + // http://flac.sourceforge.net/documentation.html. + // Only 16-bit samples are supported. + // Not all fields in STREAMINFO are supported. + FLAC = 2; + + // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. + MULAW = 3; + + // Adaptive Multi-Rate Narrowband codec. `sample_rate` must be 8000 Hz. + AMR = 4; + + // Adaptive Multi-Rate Wideband codec. `sample_rate` must be 16000 Hz. + AMR_WB = 5; + } + + // [Required] Encoding of audio data sent in all `RecognitionAudio` messages. + AudioEncoding encoding = 1; + + // [Required] Sample rate in Hertz of the audio data sent in all + // `RecognitionAudio` messages. Valid values are: 8000-48000. + // 16000 is optimal. For best results, set the sampling rate of the audio + // source to 16000 Hz. If that's not possible, use the native sample rate of + // the audio source (instead of re-sampling). + int32 sample_rate = 2; + + // [Optional] The language of the supplied audio as a BCP-47 language tag. + // Example: "en-GB" https://www.rfc-editor.org/rfc/bcp/bcp47.txt + // If omitted, defaults to "en-US". See + // [Language Support](/speech/docs/best-practices#language_support) for + // a list of the currently supported language codes. + string language_code = 3; + + // [Optional] Maximum number of recognition hypotheses to be returned. + // Specifically, the maximum number of `SpeechRecognitionAlternative` messages + // within each `SpeechRecognitionResult`. + // The server may return fewer than `max_alternatives`. + // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of + // `1`. If omitted, defaults to `1`. + int32 max_alternatives = 4; + + // [Optional] If set to `true`, the server will attempt to filter out + // profanities, replacing all but the initial character in each filtered word + // with asterisks, e.g. "f***". If set to `false` or omitted, profanities + // won't be filtered out. + bool profanity_filter = 5; + + // [Optional] A means to provide context to assist the speech recognition. + SpeechContext speech_context = 6; +} + +// Provides "hints" to the speech recognizer to favor specific words and phrases +// in the results. +message SpeechContext { + // [Optional] A list of up to 50 phrases of up to 100 characters each to + // provide words and phrases "hints" to the speech recognition so that it is + // more likely to recognize them. + repeated string phrases = 1; +} + +// Contains audio data in the encoding specified in the `RecognitionConfig`. +// Either `content` or `uri` must be supplied. Supplying both or neither +// returns [google.rpc.Code.INVALID_ARGUMENT][]. +message RecognitionAudio { + oneof audio_source { + // The audio data bytes encoded as specified in + // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a + // pure binary representation, whereas JSON representations use base64. + bytes content = 1 [ctype = CORD]; + + // URI that points to a file that contains audio data bytes as specified in + // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are + // supported, which must be specified in the following format: + // `gs://bucket_name/object_name` (other URI formats return + // [google.rpc.Code.INVALID_ARGUMENT][]). For more information, see + // [Request URIs](/storage/docs/reference-uris). + string uri = 2; + } +} + +// `SyncRecognizeResponse` is the only message returned to the client by +// `SyncRecognize`. It contains the result as zero or more +// sequential `RecognizeResponse` messages. +message SyncRecognizeResponse { + // [Output-only] Sequential list of transcription results corresponding to + // sequential portions of audio. + repeated SpeechRecognitionResult results = 2; +} + +// `AsyncRecognizeResponse` is the only message returned to the client by +// `AsyncRecognize`. It contains the result as zero or more +// sequential `RecognizeResponse` messages. +message AsyncRecognizeResponse { + // [Output-only] Sequential list of transcription results corresponding to + // sequential portions of audio. + repeated SpeechRecognitionResult results = 2; +} + +// `StreamingRecognizeResponse` is the only message returned to the client by +// `StreamingRecognize`. It contains the result as zero or more +// sequential `RecognizeResponse` messages. +message StreamingRecognizeResponse { + // Indicates the type of endpointer event. + enum EndpointerType { + // No endpointer event specified. + ENDPOINTER_EVENT_UNSPECIFIED = 0; + + // Speech has been detected in the audio stream. + START_OF_SPEECH = 1; + + // Speech has ceased to be detected in the audio stream. + END_OF_SPEECH = 2; + + // The end of the audio stream has been reached. and it is being processed. + END_OF_AUDIO = 3; + + // This event is only sent when `single_utterance` is `true`. It indicates + // that the server has detected the end of the user's speech utterance and + // expects no additional speech. Therefore, the server will not process + // additional audio. The client should stop sending additional audio data. + END_OF_UTTERANCE = 4; + } + + // [Output-only] If set, returns a [google.rpc.Status][] message that + // specifies the error for the operation. + google.rpc.Status error = 1; + + // [Output-only] This repeated list contains zero or more results that + // correspond to consecutive portions of the audio currently being processed. + // It contains zero or one `is_final=true` result (the newly settled portion), + // followed by zero or more `is_final=false` results. + repeated StreamingRecognitionResult results = 2; + + // [Output-only] Indicates the lowest index in the `results` array that has + // changed. The repeated `SpeechRecognitionResult` results overwrite past + // results at this index and higher. + int32 result_index = 3; + + // [Output-only] Indicates the type of endpointer event. + EndpointerType endpointer_type = 4; +} + +// A speech recognition result corresponding to a portion of the audio that is +// currently being processed. +// TODO(gshires): add a comment describing the various repeated interim and +// alternative results fields. +message StreamingRecognitionResult { + // [Output-only] May contain one or more recognition hypotheses (up to the + // maximum specified in `max_alternatives`). + repeated SpeechRecognitionAlternative alternatives = 1; + + // [Output-only] If `false`, this `SpeechRecognitionResult` represents an + // interim result that may change. If `true`, this is the final time the + // speech service will return this particular `SpeechRecognitionResult`, + // the recognizer will not return any further hypotheses for this portion of + // the transcript and corresponding audio. + bool is_final = 2; + + // [Output-only] An estimate of the probability that the recognizer will not + // change its guess about this interim result. Values range from 0.0 + // (completely unstable) to 1.0 (completely stable). Note that this is not the + // same as `confidence`, which estimates the probability that a recognition + // result is correct. + // This field is only provided for interim results (`is_final=false`). + // The default of 0.0 is a sentinel value indicating stability was not set. + float stability = 3; +} + +// A speech recognition result corresponding to a portion of the audio. +message SpeechRecognitionResult { + // [Output-only] May contain one or more recognition hypotheses (up to the + // maximum specified in `max_alternatives`). + repeated SpeechRecognitionAlternative alternatives = 1; +} + +// Alternative hypotheses (a.k.a. n-best list). +message SpeechRecognitionAlternative { + // [Output-only] Transcript text representing the words that the user spoke. + string transcript = 1; + + // [Output-only] The confidence estimate between 0.0 and 1.0. A higher number + // means the system is more confident that the recognition is correct. + // This field is typically provided only for the top hypothesis, and only for + // `is_final=true` results. + // The default of 0.0 is a sentinel value indicating confidence was not set. + float confidence = 2; +} \ No newline at end of file diff --git a/speech/recognize.js b/speech/recognize.js index 86d513f4813..08ec7c1183f 100644 --- a/speech/recognize.js +++ b/speech/recognize.js @@ -20,13 +20,8 @@ var async = require('async'); var fs = require('fs'); // [END import_libraries] -// Url to discovery doc file -// [START discovery_doc] -var url = 'https://speech.googleapis.com/$discovery/rest'; -// [END discovery_doc] - // [START authenticating] -function getSpeechService (callback) { +function getSpeechService (host, callback) { // Acquire credentials google.auth.getApplicationDefault(function (err, authClient) { if (err) { @@ -49,6 +44,13 @@ function getSpeechService (callback) { // Load the speach service using acquired credentials console.log('Loading speech service...'); + + // Url to discovery doc file + // [START discovery_doc] + host = host || 'speech.googleapis.com' + var url = 'https://' + host + '/$discovery/rest'; + // [END discovery_doc] + google.discoverAPI({ url: url, version: 'v1', @@ -85,7 +87,7 @@ function prepareRequest (inputFile, callback) { } // [END construct_request] -function main (inputFile, callback) { +function main (inputFile, host, callback) { var requestPayload; async.waterfall([ @@ -94,7 +96,7 @@ function main (inputFile, callback) { }, function (payload, cb) { requestPayload = payload; - getSpeechService(cb); + getSpeechService(host, cb); }, // [START send_request] function sendRequest (speechService, authClient, cb) { @@ -117,11 +119,12 @@ function main (inputFile, callback) { // [START run_application] if (module === require.main) { if (process.argv.length < 3) { - console.log('Usage: node recognize '); + console.log('Usage: node recognize [speech_api_host]'); process.exit(); } var inputFile = process.argv[2]; - main(inputFile, console.log); + var host = process.argv[3]; + main(inputFile, host || 'speech.googleapis.com', console.log); } // [END run_application] // [END app] diff --git a/speech/recognize_streaming.js b/speech/recognize_streaming.js index 9bf67f6350c..459e784006c 100644 --- a/speech/recognize_streaming.js +++ b/speech/recognize_streaming.js @@ -19,10 +19,11 @@ var path = require('path'); var grpc = require('grpc'); var googleProtoFiles = require('google-proto-files'); var googleAuth = require('google-auto-auth'); +var Transform = require('stream').Transform; // [START proto] var PROTO_ROOT_DIR = googleProtoFiles('..'); -var PROTO_FILE_PATH = googleProtoFiles('cloud', 'speech', 'v1', 'cloud_speech.proto'); +var PROTO_FILE_PATH = path.join(__dirname, 'cloud_speech.proto'); var protoDescriptor = grpc.load({ root: PROTO_ROOT_DIR, @@ -31,11 +32,11 @@ var protoDescriptor = grpc.load({ binaryAsBase64: true, convertFieldsToCamelCase: true }); -var speechProto = protoDescriptor.google.cloud.speech.v1; +var speechProto = protoDescriptor.google.cloud.speech.v1beta1; // [END proto] // [START authenticating] -function getSpeechService (callback) { +function getSpeechService (host, callback) { var googleAuthClient = googleAuth({ scopes: [ 'https://www.googleapis.com/auth/cloud-platform' @@ -53,46 +54,31 @@ function getSpeechService (callback) { ); console.log('Loading speech service...'); - var stub = new speechProto.Speech('speech.googleapis.com', credentials); + var stub = new speechProto.Speech(host, credentials); return callback(null, stub); }); } // [END authenticating] -// [START construct_request] -function getAudioFile (inputFile, callback) { - fs.readFile(inputFile, function (err, audioFile) { - if (err) { - return callback(err); - } - console.log('Got audio file!'); - return callback(null, audioFile); - }); -} -// [END construct_request] - -function main (inputFile, callback) { - var audioFile; - +function main (inputFile, host, callback) { async.waterfall([ function (cb) { - getAudioFile(inputFile, cb); - }, - function (_audioFile, cb) { - audioFile = _audioFile; - getSpeechService(cb); + getSpeechService(host, cb); }, // [START send_request] function sendRequest (speechService, cb) { console.log('Analyzing speech...'); var responses = []; - var call = speechService.recognize(); + var call = speechService.streamingRecognize(); // Listen for various responses call.on('error', cb); call.on('data', function (recognizeResponse) { if (recognizeResponse) { responses.push(recognizeResponse); + if (recognizeResponse.results && recognizeResponse.results.length) { + console.log(JSON.stringify(recognizeResponse.results, null, 2)); + } } }); call.on('end', function () { @@ -100,25 +86,28 @@ function main (inputFile, callback) { }); // Write the initial recognize reqeust - call.write(new speechProto.RecognizeRequest({ - initialRequest: new speechProto.InitialRecognizeRequest({ - encoding: 'LINEAR16', - sampleRate: 16000, + call.write({ + streamingConfig: { + config: { + encoding: 'LINEAR16', + sampleRate: 16000 + }, interimResults: false, - continuous: false, - enableEndpointerEvents: false - }) - })); - - // Write an audio request - call.write(new speechProto.RecognizeRequest({ - audioRequest: new speechProto.AudioRequest({ - content: audioFile - }) - })); + singleUtterance: false + } + }); - // Signal that we're done writing - call.end(); + var toRecognizeRequest = new Transform({ objectMode: true }); + toRecognizeRequest._transform = function (chunk, encoding, done) { + done(null, { + audioContent: chunk + }); + }; + + // Stream the audio to the Speech API + fs.createReadStream(inputFile) + .pipe(toRecognizeRequest) + .pipe(call); } // [END send_request] ], callback); @@ -127,11 +116,12 @@ function main (inputFile, callback) { // [START run_application] if (module === require.main) { if (process.argv.length < 3) { - console.log('Usage: node recognize_streaming '); + console.log('Usage: node recognize_streaming [speech_api_host]'); process.exit(); } var inputFile = process.argv[2]; - main(inputFile, console.log); + var host = process.argv[3]; + main(inputFile, host || 'speech.googleapis.com', console.log); } // [END run_application] diff --git a/test/speech/recognize_streaming.test.js b/test/speech/recognize_streaming.test.js index f3dff00f52e..0ae242210f2 100644 --- a/test/speech/recognize_streaming.test.js +++ b/test/speech/recognize_streaming.test.js @@ -17,15 +17,18 @@ var path = require('path'); var recognizeExample = require('../../speech/recognize_streaming'); describe('speech:recognize_streaming', function () { - it('should list entries', function (done) { + it('should recognize audio', function (done) { recognizeExample.main( path.join(__dirname, '../../speech/resources/audio.raw'), + process.env.SPEECH_API_HOST || 'speech.googleapis.com', function (err, results) { assert(!err); assert(results); - assert(results.length === 1); + assert(results.length === 3); assert(results[0].results); - assert(console.log.calledWith('Got audio file!')); + assert(results[1].results); + assert(results[2].results); + assert(results[2].results.length === 1); assert(console.log.calledWith('Loading speech service...')); assert(console.log.calledWith('Analyzing speech...')); done();