chore: update proto comments and grpc timeouts (#234)
yoshi-automation authored and JustinBeckwith committed Nov 11, 2018
1 parent ab45f09 commit 3bb1eb2
Showing 4 changed files with 92 additions and 62 deletions.
@@ -1,4 +1,4 @@
// Copyright 2018 Google Inc.
// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

@@ -20,6 +21,7 @@ import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

@@ -54,7 +56,8 @@ service Speech {

// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
}
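
Since this RPC is gRPC-only, the Node.js client exposes it as a duplex stream that you can pipe raw audio into. A minimal sketch with the v1p1beta1 client follows; the file name, encoding, and sample rate are illustrative assumptions, not taken from this diff.

// Sketch only: assumes @google-cloud/speech is installed and credentials are configured.
const speech = require('@google-cloud/speech').v1p1beta1;
const fs = require('fs');

const client = new speech.SpeechClient();

const request = {
  config: {
    encoding: 'LINEAR16',        // raw 16-bit PCM
    sampleRateHertz: 16000,
    languageCode: 'en-US',
  },
  interimResults: true,          // receive partial results while audio is still streaming
};

fs.createReadStream('audio.raw')                 // hypothetical local file
  .pipe(client.streamingRecognize(request))
  .on('error', console.error)
  .on('data', response => {
    for (const result of response.results) {
      console.log(result.alternatives[0].transcript);
    }
  });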

// The top-level message sent by the client for the `Recognize` method.
@@ -98,7 +101,7 @@ message StreamingRecognizeRequest {
// `audio_content` data. The audio bytes must be encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
// pure binary representation (not base64). See
// [audio limits](https://cloud.google.com/speech/limits#content).
// [content limits](/speech-to-text/quotas#content).
bytes audio_content = 2;
}
}
@@ -218,36 +221,36 @@ message RecognitionConfig {
// Valid values for OGG_OPUS are '1'-'254'.
// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
// If `0` or omitted, defaults to one channel (mono).
// NOTE: We only recognize the first channel by default.
// Note: We only recognize the first channel by default.
// To perform independent recognition on each channel set
// enable_separate_recognition_per_channel to 'true'.
// `enable_separate_recognition_per_channel` to 'true'.
int32 audio_channel_count = 7;

// This needs to be set to ‘true’ explicitly and audio_channel_count > 1
// This needs to be set to ‘true’ explicitly and `audio_channel_count` > 1
// to get each channel recognized separately. The recognition result will
// contain a channel_tag field to state which channel that result belongs to.
// If this is not true, we will only recognize the first channel.
// NOTE: The request is also billed cumulatively for all channels recognized:
// (audio_channel_count times the audio length)
// contain a `channel_tag` field to state which channel that result belongs
// to. If this is not true, we will only recognize the first channel. The
// request is billed cumulatively for all channels recognized:
// `audio_channel_count` multiplied by the length of the audio.
bool enable_separate_recognition_per_channel = 12;
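
For example, a stereo request that recognizes both channels separately might be sketched like this with the Node.js client (bucket path, encoding, and rate are hypothetical); as the comment above notes, billing covers every recognized channel.

// Sketch only: each result in the response carries a channelTag of 1 or 2.
const speech = require('@google-cloud/speech').v1p1beta1;
const client = new speech.SpeechClient();

async function recognizeStereo() {
  const [response] = await client.recognize({
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 44100,
      languageCode: 'en-US',
      audioChannelCount: 2,
      enableSeparateRecognitionPerChannel: true,
    },
    audio: {uri: 'gs://my-bucket/stereo-call.raw'},   // hypothetical object
  });
  for (const result of response.results) {
    console.log(result.channelTag, result.alternatives[0].transcript);
  }
}

recognizeStereo().catch(console.error);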

// *Required* The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// See [Language Support](https://cloud.google.com/speech/docs/languages)
// See [Language Support](/speech-to-text/docs/languages)
// for a list of the currently supported language codes.
string language_code = 3;

// *Optional* A list of up to 3 additional
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
// listing possible alternative languages of the supplied audio.
// See [Language Support](https://cloud.google.com/speech/docs/languages)
// See [Language Support](/speech-to-text/docs/languages)
// for a list of the currently supported language codes.
// If alternative languages are listed, recognition result will contain
// recognition in the most likely language detected including the main
// language_code. The recognition result will include the language tag
// of the language detected in the audio.
// NOTE: This feature is only supported for Voice Command and Voice Search
// Note: This feature is only supported for Voice Command and Voice Search
// use cases and performance may vary for other use cases (e.g., phone call
// transcription).
repeated string alternative_language_codes = 18;
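
A sketch of a config using alternative languages; the listed codes are illustrative, and the detected language comes back on each recognition result.

// Sketch only: up to three alternative language candidates.
const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 16000,
  languageCode: 'en-US',                        // primary language
  alternativeLanguageCodes: ['es-ES', 'fr-FR'], // candidates the service may detect instead
};
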
@@ -266,7 +269,9 @@ message RecognitionConfig {
// won't be filtered out.
bool profanity_filter = 5;

// *Optional* A means to provide context to assist the speech recognition.
// *Optional* array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
repeated SpeechContext speech_contexts = 6;
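
A sketch of passing phrase hints through `speech_contexts`; the phrases are illustrative.

// Sketch only: phrase hints bias recognition toward expected vocabulary.
const config = {
  encoding: 'FLAC',
  languageCode: 'en-US',
  speechContexts: [
    {phrases: ['fair', 'fare', 'fairground']},  // hypothetical hard-to-distinguish terms
  ],
};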

// *Optional* If `true`, the top result includes a list of words and
@@ -284,18 +289,20 @@ message RecognitionConfig {
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
// NOTE: "This is currently offered as an experimental service, complimentary
// Note: This is currently offered as an experimental service, complimentary
// to all users. In the future this may be exclusively available as a
// premium feature."
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// Note: When this is true, we send all the words from the beginning of the
// audio for the top alternative in every consecutive responses.
// audio for the top alternative in every consecutive STREAMING responses.
// This is done in order to improve our speaker tags as our models learn to
// identify the speakers in the conversation over time.
// For non-streaming requests, the diarization results will be provided only
// in the top alternative of the FINAL SpeechRecognitionResult.
bool enable_speaker_diarization = 16;
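
A sketch of enabling diarization and reading per-word speaker tags from the final result's top alternative; `diarization_speaker_count` (field 17) is collapsed in this diff but exists in the proto, and the bucket path is hypothetical.

// Sketch only: with diarization on, each word in the top alternative carries a speakerTag.
const speech = require('@google-cloud/speech').v1p1beta1;
const client = new speech.SpeechClient();

async function diarize() {
  const [response] = await client.recognize({
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 8000,
      languageCode: 'en-US',
      enableSpeakerDiarization: true,
      diarizationSpeakerCount: 2,    // field 17, collapsed in this diff
    },
    audio: {uri: 'gs://my-bucket/two-speakers.raw'},   // hypothetical object
  });
  // For non-streaming requests, diarization lives in the FINAL result's top alternative.
  const last = response.results[response.results.length - 1];
  for (const w of last.alternatives[0].words) {
    console.log(`speaker ${w.speakerTag}: ${w.word}`);
  }
}

diarize().catch(console.error);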

// *Optional*
@@ -342,14 +349,18 @@ message RecognitionConfig {
string model = 13;

// *Optional* Set to true to use an enhanced model for speech recognition.
// You must also set the `model` field to a valid, enhanced model. If
// `use_enhanced` is set to true and the `model` field is not set, then
// `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
// version of the specified model does not exist, then the speech is
// recognized using the standard version of the specified model.
// If `use_enhanced` is set to true and the `model` field is not set, then
// an appropriate enhanced model is chosen if:
// 1. project is eligible for requesting enhanced models
// 2. an enhanced model exists for the audio
//
// If `use_enhanced` is true and an enhanced version of the specified model
// does not exist, then the speech is recognized using the standard version
// of the specified model.
//
// Enhanced speech models require that you opt-in to the audio logging using
// instructions in the [alpha documentation](/speech/data-sharing). If you set
// Enhanced speech models require that you opt-in to data logging using
// instructions in the
// [documentation](/speech-to-text/docs/enable-data-logging). If you set
// `use_enhanced` to true and you have not enabled audio logging, then you
// will receive an error.
bool use_enhanced = 14;
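
A sketch of pairing `use_enhanced` with an explicit model; `phone_call` is assumed here as a model with an enhanced variant, and the project must already have data logging enabled per the documentation above.

// Sketch only: falls back to the standard model if no enhanced variant exists.
const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 8000,
  languageCode: 'en-US',
  useEnhanced: true,
  model: 'phone_call',   // model assumed to have an enhanced variant
};
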
@@ -494,14 +505,14 @@ message SpeechContext {
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](https://cloud.google.com/speech/limits#content).
// [usage limits](/speech-to-text/quotas#content).
repeated string phrases = 1;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [audio limits](https://cloud.google.com/speech/limits#content).
// [content limits](/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
@@ -512,7 +523,8 @@ message RecognitionAudio {
bytes content = 1;

// URI that points to a file that contains audio data bytes as specified in
// `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
// `RecognitionConfig`. The file must not be compressed (for example, gzip).
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
@@ -658,6 +670,10 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;

// Output only. Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;
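
The new `result_end_time` arrives as a `google.protobuf.Duration` ({seconds, nanos}). A sketch of converting it to fractional seconds inside a streaming data handler, assuming `seconds` can be coerced to a plain number (some client versions surface it as a Long or string).

// Sketch only: handler for 'data' events from a streamingRecognize stream.
function logResultEndTimes(response) {
  for (const result of response.results) {
    const end = result.resultEndTime || {seconds: 0, nanos: 0};
    const endSeconds = Number(end.seconds || 0) + (end.nanos || 0) / 1e9;
    console.log(`ends at ${endSeconds.toFixed(2)}s:`, result.alternatives[0].transcript);
  }
}
// e.g. client.streamingRecognize(request).on('data', logResultEndTimes);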

// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
@@ -705,7 +721,7 @@ message SpeechRecognitionAlternative {
float confidence = 2;

// Output only. A list of word-specific information for each recognized word.
// Note: When enable_speaker_diarization is true, you will see all the words
// Note: When `enable_speaker_diarization` is true, you will see all the words
// from the beginning of the audio.
repeated WordInfo words = 3;
}
@@ -746,5 +762,4 @@ message WordInfo {
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
// top alternative.
int32 speaker_tag = 5;

}
6 changes: 3 additions & 3 deletions packages/google-cloud-speech/src/v1/speech_client_config.json
@@ -21,17 +21,17 @@
},
"methods": {
"Recognize": {
"timeout_millis": 1000000,
"timeout_millis": 200000,
"retry_codes_name": "idempotent",
"retry_params_name": "default"
},
"LongRunningRecognize": {
"timeout_millis": 60000,
"timeout_millis": 200000,
"retry_codes_name": "non_idempotent",
"retry_params_name": "default"
},
"StreamingRecognize": {
"timeout_millis": 1000000,
"timeout_millis": 200000,
"retry_codes_name": "idempotent",
"retry_params_name": "default"
}
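
These `timeout_millis` values become the default gRPC deadlines compiled into the generated client; callers can still override them per call through gax call options. A sketch follows (request contents are hypothetical; 200000 ms simply mirrors the new default).

// Sketch only: the second argument to a generated method accepts gax CallOptions.
const speech = require('@google-cloud/speech').v1p1beta1;
const client = new speech.SpeechClient();

const request = {
  config: {encoding: 'FLAC', languageCode: 'en-US'},
  audio: {uri: 'gs://my-bucket/short-clip.flac'},   // hypothetical object
};

// Override the configured 200000 ms default for this one call.
client.recognize(request, {timeout: 200000})
  .then(([response]) => console.log(JSON.stringify(response, null, 2)))
  .catch(console.error);
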
@@ -82,7 +82,7 @@ const LongRunningRecognizeRequest = {
* `audio_content` data. The audio bytes must be encoded as specified in
* `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
* pure binary representation (not base64). See
* [audio limits](https://cloud.google.com/speech/limits#content).
* [content limits](https://cloud.google.com/speech-to-text/quotas#content).
*
* @typedef StreamingRecognizeRequest
* @memberof google.cloud.speech.v1p1beta1
@@ -156,36 +156,36 @@ const StreamingRecognitionConfig = {
* Valid values for OGG_OPUS are '1'-'254'.
* Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
* If `0` or omitted, defaults to one channel (mono).
* NOTE: We only recognize the first channel by default.
* Note: We only recognize the first channel by default.
* To perform independent recognition on each channel set
* enable_separate_recognition_per_channel to 'true'.
* `enable_separate_recognition_per_channel` to 'true'.
*
* @property {boolean} enableSeparateRecognitionPerChannel
* This needs to be set to ‘true’ explicitly and audio_channel_count > 1
* This needs to be set to ‘true’ explicitly and `audio_channel_count` > 1
* to get each channel recognized separately. The recognition result will
* contain a channel_tag field to state which channel that result belongs to.
* If this is not true, we will only recognize the first channel.
* NOTE: The request is also billed cumulatively for all channels recognized:
* (audio_channel_count times the audio length)
* contain a `channel_tag` field to state which channel that result belongs
* to. If this is not true, we will only recognize the first channel. The
* request is billed cumulatively for all channels recognized:
* `audio_channel_count` multiplied by the length of the audio.
*
* @property {string} languageCode
* *Required* The language of the supplied audio as a
* [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
* Example: "en-US".
* See [Language Support](https://cloud.google.com/speech/docs/languages)
* See [Language Support](https://cloud.google.com/speech-to-text/docs/languages)
* for a list of the currently supported language codes.
*
* @property {string[]} alternativeLanguageCodes
* *Optional* A list of up to 3 additional
* [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
* listing possible alternative languages of the supplied audio.
* See [Language Support](https://cloud.google.com/speech/docs/languages)
* See [Language Support](https://cloud.google.com/speech-to-text/docs/languages)
* for a list of the currently supported language codes.
* If alternative languages are listed, recognition result will contain
* recognition in the most likely language detected including the main
* language_code. The recognition result will include the language tag
* of the language detected in the audio.
* NOTE: This feature is only supported for Voice Command and Voice Search
* Note: This feature is only supported for Voice Command and Voice Search
* use cases and performance may vary for other use cases (e.g., phone call
* transcription).
*
@@ -204,7 +204,9 @@ const StreamingRecognitionConfig = {
* won't be filtered out.
*
* @property {Object[]} speechContexts
* *Optional* A means to provide context to assist the speech recognition.
* *Optional* array of SpeechContext.
* A means to provide context to assist the speech recognition. For more
* information, see [Phrase Hints](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints).
*
* This object should have the same structure as [SpeechContext]{@link google.cloud.speech.v1p1beta1.SpeechContext}
*
@@ -224,18 +226,20 @@ const StreamingRecognitionConfig = {
* This feature is only available in select languages. Setting this for
* requests in other languages has no effect at all.
* The default 'false' value does not add punctuation to result hypotheses.
* NOTE: "This is currently offered as an experimental service, complimentary
* Note: This is currently offered as an experimental service, complimentary
* to all users. In the future this may be exclusively available as a
* premium feature."
* premium feature.
*
* @property {boolean} enableSpeakerDiarization
* *Optional* If 'true', enables speaker detection for each recognized word in
* the top alternative of the recognition result using a speaker_tag provided
* in the WordInfo.
* Note: When this is true, we send all the words from the beginning of the
* audio for the top alternative in every consecutive responses.
* audio for the top alternative in every consecutive STREAMING responses.
* This is done in order to improve our speaker tags as our models learn to
* identify the speakers in the conversation over time.
* For non-streaming requests, the diarization results will be provided only
* in the top alternative of the FINAL SpeechRecognitionResult.
*
* @property {number} diarizationSpeakerCount
* *Optional*
@@ -284,14 +288,18 @@ const StreamingRecognitionConfig = {
*
* @property {boolean} useEnhanced
* *Optional* Set to true to use an enhanced model for speech recognition.
* You must also set the `model` field to a valid, enhanced model. If
* `use_enhanced` is set to true and the `model` field is not set, then
* `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
* version of the specified model does not exist, then the speech is
* recognized using the standard version of the specified model.
*
* Enhanced speech models require that you opt-in to the audio logging using
* instructions in the [alpha documentation](https://cloud.google.com/speech/data-sharing). If you set
* If `use_enhanced` is set to true and the `model` field is not set, then
* an appropriate enhanced model is chosen if:
* 1. project is eligible for requesting enhanced models
* 2. an enhanced model exists for the audio
*
* If `use_enhanced` is true and an enhanced version of the specified model
* does not exist, then the speech is recognized using the standard version
* of the specified model.
*
* Enhanced speech models require that you opt-in to data logging using
* instructions in the
* [documentation](https://cloud.google.com/speech-to-text/docs/enable-data-logging). If you set
* `use_enhanced` to true and you have not enabled audio logging, then you
* will receive an error.
*
@@ -617,7 +625,7 @@ const RecognitionMetadata = {
* to improve the accuracy for specific words and phrases, for example, if
* specific commands are typically spoken by the user. This can also be used
* to add additional words to the vocabulary of the recognizer. See
* [usage limits](https://cloud.google.com/speech/limits#content).
* [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
*
* @typedef SpeechContext
* @memberof google.cloud.speech.v1p1beta1
@@ -631,7 +639,7 @@ const SpeechContext = {
* Contains audio data in the encoding specified in the `RecognitionConfig`.
* Either `content` or `uri` must be supplied. Supplying both or neither
* returns google.rpc.Code.INVALID_ARGUMENT. See
* [audio limits](https://cloud.google.com/speech/limits#content).
* [content limits](https://cloud.google.com/speech-to-text/quotas#content).
*
* @property {string} content
* The audio data bytes encoded as specified in
@@ -640,7 +648,8 @@ const SpeechContext = {
*
* @property {string} uri
* URI that points to a file that contains audio data bytes as specified in
* `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
* `RecognitionConfig`. The file must not be compressed (for example, gzip).
* Currently, only Google Cloud Storage URIs are
* supported, which must be specified in the following format:
* `gs://bucket_name/object_name` (other URI formats return
* google.rpc.Code.INVALID_ARGUMENT). For more information, see
@@ -850,6 +859,12 @@ const StreamingRecognizeResponse = {
* This field is only provided for interim results (`is_final=false`).
* The default of 0.0 is a sentinel value indicating `stability` was not set.
*
* @property {Object} resultEndTime
* Output only. Time offset of the end of this result relative to the
* beginning of the audio.
*
* This object should have the same structure as [Duration]{@link google.protobuf.Duration}
*
* @property {number} channelTag
* For multi-channel audio, this is the channel number corresponding to the
* recognized result for the audio from that channel.
@@ -916,7 +931,7 @@ const SpeechRecognitionResult = {
*
* @property {Object[]} words
* Output only. A list of word-specific information for each recognized word.
* Note: When enable_speaker_diarization is true, you will see all the words
* Note: When `enable_speaker_diarization` is true, you will see all the words
* from the beginning of the audio.
*
* This object should have the same structure as [WordInfo]{@link google.cloud.speech.v1p1beta1.WordInfo}