Skip to content

Commit

Permalink
feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API (#4775)
Browse files Browse the repository at this point in the history

* feat: add transcript normalization + m4a audio format support

docs: clarify alternatives for deprecated fields

feat: deprecate `BatchRecognizeFileResult.uri` in favor of `cloud_storage_result.native_format_uri`

feat: deprecate `BatchRecognizeFileResult.transcript` in favor of `inline_result.transcript`
PiperOrigin-RevId: 577926708

Source-Link: googleapis/googleapis@37e816b

Source-Link: googleapis/googleapis-gen@e12bd7b
Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZTEyYmQ3YmRiYmI5ZDJlNDE4YTkyMjA3NWQyM2Y3N2E4YzFlNzQ4NSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API

PiperOrigin-RevId: 578629599

Source-Link: googleapis/googleapis@08facab

Source-Link: googleapis/googleapis-gen@75903e0
Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNzU5MDNlMGZlNjk1OTAwZjY4NGM3MmNhOGI1YjllNmJjMTYwMDQ4YSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* fix(ci): log changes when CI changes are detected

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Alexander Fenster <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent ca49756 commit 10e783c
Show file tree
Hide file tree
Showing 7 changed files with 2,485 additions and 81 deletions.
2 changes: 2 additions & 0 deletions ci/run_conditional_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ if [[ "${changed}" -eq 0 ]]; then
echo "no change detected in ci"
else
echo "change detected in ci, we should test everything"
echo "result of git diff ${GIT_DIFF_ARG} ci:"
git diff ${GIT_DIFF_ARG} ci
GIT_DIFF_ARG=""
fi

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -260,6 +260,12 @@ message RecognitionConfig {
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;

// MP3 audio. MP3 encoding is a Beta feature and only available in
// v1p1beta1. Supports all standard MP3 bitrates (which range from 32-320
// kbps). When using this encoding, `sample_rate_hertz` has to match the
// sample rate of the file being used.
MP3 = 8;

// Opus encoded audio frames in WebM container
// ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
// one of 8000, 12000, 16000, 24000, or 48000.
Expand Down Expand Up @@ -343,6 +349,13 @@ message RecognitionConfig {
// When speech adaptation is set it supersedes the `speech_contexts` field.
SpeechAdaptation adaptation = 20;

// Optional. Use transcription normalization to automatically replace parts of
// the transcript with phrases of your choosing. For StreamingRecognize, this
// normalization only applies to stable partial transcripts (stability > 0.8)
// and final transcripts.
TranscriptNormalization transcript_normalization = 24
[(google.api.field_behavior) = OPTIONAL];

// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
Expand Down Expand Up @@ -463,8 +476,8 @@ message RecognitionConfig {
// Config to enable speaker diarization.
message SpeakerDiarizationConfig {
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// the top alternative of the recognition result using a speaker_label
// provided in the WordInfo.
bool enable_speaker_diarization = 1;

// Minimum number of speakers in the conversation. This range gives you more
Expand Down Expand Up @@ -956,9 +969,19 @@ message WordInfo {
// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
// speaker_tag is set if enable_speaker_diarization = 'true' and only for the
// top alternative.
int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
// Note: Use speaker_label instead.
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];

// Output only. A label value assigned for every unique speaker within the
// audio. This field specifies which speaker was detected to have spoken this
// word. For some models, like medical_conversation this can be actual speaker
// role, for example "patient" or "provider", but generally this would be a
// number identifying a speaker. This field is only set if
// enable_speaker_diarization = 'true' and only for the top alternative.
string speaker_label = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Information on speech adaptation use in results
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -147,3 +147,27 @@ message SpeechAdaptation {
// See specifications: https://www.w3.org/TR/speech-grammar
ABNFGrammar abnf_grammar = 4;
}

// Transcription normalization configuration. Use transcription normalization
// to automatically replace parts of the transcript with phrases of your
// choosing. For StreamingRecognize, this normalization only applies to stable
// partial transcripts (stability > 0.8) and final transcripts.
message TranscriptNormalization {
  // A single replacement rule: occurrences of `search` in the transcript are
  // replaced with `replace`.
  message Entry {
    // What to replace. Max length is 100 characters.
    string search = 1;

    // What to replace with. Max length is 100 characters.
    string replace = 2;

    // Whether the search is case sensitive. Proto3 default is false, i.e.
    // case-insensitive matching.
    bool case_sensitive = 3;
  }

  // A list of replacement entries. We will perform replacement with one entry
  // at a time. For example, the second entry in ["cat" => "dog", "mountain cat"
  // => "mountain dog"] will never be applied because we will always process the
  // first entry before it. At most 100 entries.
  repeated Entry entries = 1;
}
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,12 @@ message Recognizer {
// characters or less.
string display_name = 3;

// Optional. Which model to use for recognition requests. Select the model
// best suited to your domain to get best results.
// Optional. This field is now deprecated. Prefer the
// [`model`][google.cloud.speech.v2.RecognitionConfig.model] field in the
// [`RecognitionConfig`][google.cloud.speech.v2.RecognitionConfig] message.
//
// Which model to use for recognition requests. Select the model best suited
// to your domain to get best results.
//
// Guidance for choosing which model to use can be found in the [Transcription
// Models
Expand All @@ -596,7 +600,12 @@ message Recognizer {
// Models](https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages).
string model = 4 [deprecated = true, (google.api.field_behavior) = OPTIONAL];

// Optional. The language of the supplied audio as a
// Optional. This field is now deprecated. Prefer the
// [`language_codes`][google.cloud.speech.v2.RecognitionConfig.language_codes]
// field in the
// [`RecognitionConfig`][google.cloud.speech.v2.RecognitionConfig] message.
//
// The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
//
// Supported languages for each model are listed in the [Table of Supported
Expand Down Expand Up @@ -693,6 +702,8 @@ message Recognizer {
// * OGG_OPUS: Opus audio frames in an Ogg container.
//
// * WEBM_OPUS: Opus audio frames in a WebM container.
//
// * M4A: M4A audio format.
message AutoDetectDecodingConfig {}

// Explicitly specified decoding parameters.
Expand Down Expand Up @@ -827,6 +838,30 @@ message RecognitionFeatures {
int32 max_alternatives = 16;
}

// Transcription normalization configuration. Use transcription normalization
// to automatically replace parts of the transcript with phrases of your
// choosing. For StreamingRecognize, this normalization only applies to stable
// partial transcripts (stability > 0.8) and final transcripts.
message TranscriptNormalization {
  // A single replacement rule: occurrences of `search` in the transcript are
  // replaced with `replace`.
  message Entry {
    // What to replace. Max length is 100 characters.
    string search = 1;

    // What to replace with. Max length is 100 characters.
    string replace = 2;

    // Whether the search is case sensitive. Proto3 default is false, i.e.
    // case-insensitive matching.
    bool case_sensitive = 3;
  }

  // A list of replacement entries. We will perform replacement with one entry
  // at a time. For example, the second entry in ["cat" => "dog", "mountain cat"
  // => "mountain dog"] will never be applied because we will always process the
  // first entry before it. At most 100 entries.
  repeated Entry entries = 1;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results. PhraseSets can be specified as an inline resource, or a
// reference to an existing PhraseSet resource.
Expand Down Expand Up @@ -898,6 +933,13 @@ message RecognitionConfig {
// Speech adaptation context that weights recognizer predictions for specific
// words and phrases.
SpeechAdaptation adaptation = 6;

// Optional. Use transcription normalization to automatically replace parts of
// the transcript with phrases of your choosing. For StreamingRecognize, this
// normalization only applies to stable partial transcripts (stability > 0.8)
// and final transcripts.
TranscriptNormalization transcript_normalization = 11
[(google.api.field_behavior) = OPTIONAL];
}

// Request message for the
Expand Down Expand Up @@ -1275,21 +1317,45 @@ message BatchRecognizeResults {
RecognitionResponseMetadata metadata = 2;
}

// Final results for a single file.
message BatchRecognizeFileResult {
// Final results written to Cloud Storage.
//
// Carried in the `cloud_storage_result` branch of
// `BatchRecognizeFileResult.result` when recognition output was configured
// to be written to Cloud Storage.
message CloudStorageResult {
  // The Cloud Storage URI to which recognition results were written.
  string uri = 1;
}

// Final results returned inline in the recognition response.
//
// Carried in the `inline_result` branch of
// `BatchRecognizeFileResult.result` when recognition output was configured
// to be returned inline.
message InlineResult {
  // The transcript for the audio file.
  BatchRecognizeResults transcript = 1;
}

// Final results for a single file.
message BatchRecognizeFileResult {
  // Error if one was encountered.
  google.rpc.Status error = 2;

  // Metadata about the recognition of this file.
  RecognitionResponseMetadata metadata = 3;

  // Where the recognition results for this file were delivered; exactly one
  // branch is set depending on the configured output destination.
  oneof result {
    // Recognition results written to Cloud Storage. This is
    // populated only when
    // [GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is set in
    // the
    // [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
    CloudStorageResult cloud_storage_result = 5;

    // Recognition results. This is populated only when
    // [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig] is set
    // in the
    // [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
    InlineResult inline_result = 6;
  }

  // Deprecated. Use `cloud_storage_result.native_format_uri` instead.
  // NOTE(review): `CloudStorageResult` as shown here only defines `uri` —
  // confirm the replacement field name against the current proto.
  string uri = 1 [deprecated = true];

  // Deprecated. Use `inline_result.transcript` instead.
  BatchRecognizeResults transcript = 4 [deprecated = true];
}

// Metadata about transcription for a single file (for example, progress
Expand Down
Loading

0 comments on commit 10e783c

Please sign in to comment.