Skip to content

Commit

Permalink
feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API (#4775)
Browse files Browse the repository at this point in the history

* feat: add transcript normalization + m4a audio format support

docs: clarify alternatives for deprecated fields

feat: deprecate `BatchRecognizeFileResult.uri` in favor of `cloud_storage_result.native_format_uri`

feat: deprecate `BatchRecognizeFileResult.transcript` in favor of `inline_result.transcript`
PiperOrigin-RevId: 577926708

Source-Link: googleapis/googleapis@37e816b

Source-Link: googleapis/googleapis-gen@e12bd7b
Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZTEyYmQ3YmRiYmI5ZDJlNDE4YTkyMjA3NWQyM2Y3N2E4YzFlNzQ4NSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API

PiperOrigin-RevId: 578629599

Source-Link: googleapis/googleapis@08facab

Source-Link: googleapis/googleapis-gen@75903e0
Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNzU5MDNlMGZlNjk1OTAwZjY4NGM3MmNhOGI1YjllNmJjMTYwMDQ4YSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* fix(ci): log changes when CI changes are detected

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Alexander Fenster <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent ca49756 commit 10e783c
Show file tree
Hide file tree
Showing 7 changed files with 2,485 additions and 81 deletions.
2 changes: 2 additions & 0 deletions ci/run_conditional_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ if [[ "${changed}" -eq 0 ]]; then
echo "no change detected in ci"
else
echo "change detected in ci, we should test everything"
echo "result of git diff ${GIT_DIFF_ARG} ci:"
git diff ${GIT_DIFF_ARG} ci
GIT_DIFF_ARG=""
fi

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -260,6 +260,12 @@ message RecognitionConfig {
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;

// MP3 audio. MP3 encoding is a Beta feature and only available in
// v1p1beta1. Supports all standard MP3 bitrates (which range from 32-320
// kbps). When using this encoding, `sample_rate_hertz` has to match the
// sample rate of the file being used.
MP3 = 8;

// Opus encoded audio frames in WebM container
// ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
// one of 8000, 12000, 16000, 24000, or 48000.
Expand Down Expand Up @@ -343,6 +349,13 @@ message RecognitionConfig {
// When speech adaptation is set it supersedes the `speech_contexts` field.
SpeechAdaptation adaptation = 20;

// Optional. Use transcription normalization to automatically replace parts of
// the transcript with phrases of your choosing. For StreamingRecognize, this
// normalization only applies to stable partial transcripts (stability > 0.8)
// and final transcripts.
TranscriptNormalization transcript_normalization = 24
[(google.api.field_behavior) = OPTIONAL];

// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
Expand Down Expand Up @@ -463,8 +476,8 @@ message RecognitionConfig {
// Config to enable speaker diarization.
message SpeakerDiarizationConfig {
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// the top alternative of the recognition result using a speaker_label
// provided in the WordInfo.
bool enable_speaker_diarization = 1;

// Minimum number of speakers in the conversation. This range gives you more
Expand Down Expand Up @@ -956,9 +969,19 @@ message WordInfo {
// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
// speaker_tag is set if enable_speaker_diarization = 'true' and only for the
// top alternative.
int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
// Note: Use speaker_label instead.
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];

// Output only. A label value assigned for every unique speaker within the
// audio. This field specifies which speaker was detected to have spoken this
// word. For some models, like medical_conversation this can be actual speaker
// role, for example "patient" or "provider", but generally this would be a
// number identifying a speaker. This field is only set if
// enable_speaker_diarization = 'true' and only for the top alternative.
string speaker_label = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Information on speech adaptation use in results
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -147,3 +147,27 @@ message SpeechAdaptation {
// See specifications: https://www.w3.org/TR/speech-grammar
ABNFGrammar abnf_grammar = 4;
}

// Transcription normalization configuration. Use transcription normalization
// to automatically replace parts of the transcript with phrases of your
// choosing. For StreamingRecognize, this normalization only applies to stable
// partial transcripts (stability > 0.8) and final transcripts.
message TranscriptNormalization {
  // A single replacement rule: occurrences of `search` in the transcript are
  // replaced with `replace`.
  message Entry {
    // What to replace. Max length is 100 characters.
    string search = 1;

    // What to replace with. Max length is 100 characters.
    string replace = 2;

    // Whether the search is case sensitive. Proto3 default is false, i.e.
    // case-insensitive matching.
    bool case_sensitive = 3;
  }

  // A list of replacement entries. We will perform replacement with one entry
  // at a time. For example, the second entry in ["cat" => "dog", "mountain cat"
  // => "mountain dog"] will never be applied because we will always process the
  // first entry before it. At most 100 entries.
  repeated Entry entries = 1;
}
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,12 @@ message Recognizer {
// characters or less.
string display_name = 3;

// Optional. Which model to use for recognition requests. Select the model
// best suited to your domain to get best results.
// Optional. This field is now deprecated. Prefer the
// [`model`][google.cloud.speech.v2.RecognitionConfig.model] field in the
// [`RecognitionConfig`][google.cloud.speech.v2.RecognitionConfig] message.
//
// Which model to use for recognition requests. Select the model best suited
// to your domain to get best results.
//
// Guidance for choosing which model to use can be found in the [Transcription
// Models
Expand All @@ -596,7 +600,12 @@ message Recognizer {
// Models](https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages).
string model = 4 [deprecated = true, (google.api.field_behavior) = OPTIONAL];

// Optional. The language of the supplied audio as a
// Optional. This field is now deprecated. Prefer the
// [`language_codes`][google.cloud.speech.v2.RecognitionConfig.language_codes]
// field in the
// [`RecognitionConfig`][google.cloud.speech.v2.RecognitionConfig] message.
//
// The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
//
// Supported languages for each model are listed in the [Table of Supported
Expand Down Expand Up @@ -693,6 +702,8 @@ message Recognizer {
// * OGG_OPUS: Opus audio frames in an Ogg container.
//
// * WEBM_OPUS: Opus audio frames in a WebM container.
//
// * M4A: M4A audio format.
message AutoDetectDecodingConfig {}

// Explicitly specified decoding parameters.
Expand Down Expand Up @@ -827,6 +838,30 @@ message RecognitionFeatures {
int32 max_alternatives = 16;
}

// Transcription normalization configuration. Use transcription normalization
// to automatically replace parts of the transcript with phrases of your
// choosing. For StreamingRecognize, this normalization only applies to stable
// partial transcripts (stability > 0.8) and final transcripts.
message TranscriptNormalization {
  // A single replacement rule: occurrences of `search` in the transcript are
  // replaced with `replace`.
  message Entry {
    // What to replace. Max length is 100 characters.
    string search = 1;

    // What to replace with. Max length is 100 characters.
    string replace = 2;

    // Whether the search is case sensitive. Proto3 default is false, i.e.
    // case-insensitive matching.
    bool case_sensitive = 3;
  }

  // A list of replacement entries. We will perform replacement with one entry
  // at a time. For example, the second entry in ["cat" => "dog", "mountain cat"
  // => "mountain dog"] will never be applied because we will always process the
  // first entry before it. At most 100 entries.
  repeated Entry entries = 1;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results. PhraseSets can be specified as an inline resource, or a
// reference to an existing PhraseSet resource.
Expand Down Expand Up @@ -898,6 +933,13 @@ message RecognitionConfig {
// Speech adaptation context that weights recognizer predictions for specific
// words and phrases.
SpeechAdaptation adaptation = 6;

// Optional. Use transcription normalization to automatically replace parts of
// the transcript with phrases of your choosing. For StreamingRecognize, this
// normalization only applies to stable partial transcripts (stability > 0.8)
// and final transcripts.
TranscriptNormalization transcript_normalization = 11
[(google.api.field_behavior) = OPTIONAL];
}

// Request message for the
Expand Down Expand Up @@ -1275,21 +1317,45 @@ message BatchRecognizeResults {
RecognitionResponseMetadata metadata = 2;
}

// Final results for a single file.
message BatchRecognizeFileResult {
// Final results written to Cloud Storage.
//
// Carried in the `cloud_storage_result` branch of
// `BatchRecognizeFileResult.result` when recognition output was configured
// to be written to Cloud Storage.
message CloudStorageResult {
  // The Cloud Storage URI to which recognition results were written.
  string uri = 1;
}

// Final results returned inline in the recognition response.
//
// Carried in the `inline_result` branch of
// `BatchRecognizeFileResult.result` when recognition output was configured
// to be returned inline.
message InlineResult {
  // The transcript for the audio file.
  BatchRecognizeResults transcript = 1;
}

// Final results for a single file.
message BatchRecognizeFileResult {
  // Error if one was encountered.
  google.rpc.Status error = 2;

  // Metadata about the recognition of this file.
  RecognitionResponseMetadata metadata = 3;

  // Where the recognition results for this file were delivered; exactly one
  // branch is set depending on the configured output destination.
  oneof result {
    // Recognition results written to Cloud Storage. This is
    // populated only when
    // [GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is set in
    // the
    // [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
    CloudStorageResult cloud_storage_result = 5;

    // Recognition results. This is populated only when
    // [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig] is set
    // in the
    // [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
    InlineResult inline_result = 6;
  }

  // Deprecated. Use `cloud_storage_result.native_format_uri` instead.
  // NOTE(review): `CloudStorageResult` as shown here only defines `uri` —
  // confirm the replacement field name against the current proto.
  string uri = 1 [deprecated = true];

  // Deprecated. Use `inline_result.transcript` instead.
  BatchRecognizeResults transcript = 4 [deprecated = true];
}

// Metadata about transcription for a single file (for example, progress
Expand Down
Loading

0 comments on commit 10e783c

Please sign in to comment.