diff --git a/packages/google-cloud-python-speech/google/cloud/speech/__init__.py b/packages/google-cloud-python-speech/google/cloud/speech/__init__.py index 4b7178005987..54e539cb4510 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech/__init__.py +++ b/packages/google-cloud-python-speech/google/cloud/speech/__init__.py @@ -35,6 +35,9 @@ from google.cloud.speech_v1.types.cloud_speech import StreamingRecognizeResponse from google.cloud.speech_v1.types.cloud_speech import TranscriptOutputConfig from google.cloud.speech_v1.types.cloud_speech import WordInfo +from google.cloud.speech_v1.types.resource import CustomClass +from google.cloud.speech_v1.types.resource import PhraseSet +from google.cloud.speech_v1.types.resource import SpeechAdaptation __all__ = ( "SpeechClient", @@ -57,4 +60,7 @@ "StreamingRecognizeResponse", "TranscriptOutputConfig", "WordInfo", + "CustomClass", + "PhraseSet", + "SpeechAdaptation", ) diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/__init__.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/__init__.py index 5a9e4a6ab232..47592b254f4a 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/__init__.py +++ b/packages/google-cloud-python-speech/google/cloud/speech_v1/__init__.py @@ -35,6 +35,9 @@ from .types.cloud_speech import StreamingRecognizeResponse from .types.cloud_speech import TranscriptOutputConfig from .types.cloud_speech import WordInfo +from .types.resource import CustomClass +from .types.resource import PhraseSet +from .types.resource import SpeechAdaptation from google.cloud.speech_v1.helpers import SpeechHelpers @@ -45,15 +48,18 @@ class SpeechClient(SpeechHelpers, SpeechClient): __all__ = ( "SpeechAsyncClient", + "CustomClass", "LongRunningRecognizeMetadata", "LongRunningRecognizeRequest", "LongRunningRecognizeResponse", + "PhraseSet", "RecognitionAudio", "RecognitionConfig", "RecognitionMetadata", "RecognizeRequest", "RecognizeResponse", 
"SpeakerDiarizationConfig", + "SpeechAdaptation", "SpeechClient", "SpeechContext", "SpeechRecognitionAlternative", diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/async_client.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/async_client.py index 2ef1ea232611..fc6976e3384b 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/async_client.py +++ b/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/async_client.py @@ -58,6 +58,10 @@ class SpeechAsyncClient: DEFAULT_ENDPOINT = SpeechClient.DEFAULT_ENDPOINT DEFAULT_MTLS_ENDPOINT = SpeechClient.DEFAULT_MTLS_ENDPOINT + custom_class_path = staticmethod(SpeechClient.custom_class_path) + parse_custom_class_path = staticmethod(SpeechClient.parse_custom_class_path) + phrase_set_path = staticmethod(SpeechClient.phrase_set_path) + parse_phrase_set_path = staticmethod(SpeechClient.parse_phrase_set_path) common_billing_account_path = staticmethod(SpeechClient.common_billing_account_path) parse_common_billing_account_path = staticmethod( SpeechClient.parse_common_billing_account_path diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/client.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/client.py index da6f7754647c..94879ecd7756 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/client.py +++ b/packages/google-cloud-python-speech/google/cloud/speech_v1/services/speech/client.py @@ -160,6 +160,38 @@ def transport(self) -> SpeechTransport: """ return self._transport + @staticmethod + def custom_class_path(project: str, location: str, custom_class: str,) -> str: + """Returns a fully-qualified custom_class string.""" + return "projects/{project}/locations/{location}/customClasses/{custom_class}".format( + project=project, location=location, custom_class=custom_class, + ) + + @staticmethod + def 
parse_custom_class_path(path: str) -> Dict[str, str]: + """Parses a custom_class path into its component segments.""" + m = re.match( + r"^projects/(?P<project>.+?)/locations/(?P<location>.+?)/customClasses/(?P<custom_class>.+?)$", + path, + ) + return m.groupdict() if m else {} + + @staticmethod + def phrase_set_path(project: str, location: str, phrase_set: str,) -> str: + """Returns a fully-qualified phrase_set string.""" + return "projects/{project}/locations/{location}/phraseSets/{phrase_set}".format( + project=project, location=location, phrase_set=phrase_set, + ) + + @staticmethod + def parse_phrase_set_path(path: str) -> Dict[str, str]: + """Parses a phrase_set path into its component segments.""" + m = re.match( + r"^projects/(?P<project>.+?)/locations/(?P<location>.+?)/phraseSets/(?P<phrase_set>.+?)$", + path, + ) + return m.groupdict() if m else {} + @staticmethod def common_billing_account_path(billing_account: str,) -> str: """Returns a fully-qualified billing_account string.""" diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/types/__init__.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/__init__.py index 858051afd543..4075fc315479 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/types/__init__.py +++ b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/__init__.py @@ -33,6 +33,11 @@ TranscriptOutputConfig, WordInfo, ) +from .resource import ( + CustomClass, + PhraseSet, + SpeechAdaptation, +) __all__ = ( "LongRunningRecognizeMetadata", @@ -53,4 +58,7 @@ "StreamingRecognizeResponse", "TranscriptOutputConfig", "WordInfo", + "CustomClass", + "PhraseSet", + "SpeechAdaptation", ) diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/types/cloud_speech.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/cloud_speech.py index e5b382f9b4c8..d94c11a1a8dd 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/types/cloud_speech.py +++ 
b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/cloud_speech.py @@ -15,8 +15,10 @@ # import proto # type: ignore +from google.cloud.speech_v1.types import resource from google.protobuf import duration_pb2 # type: ignore from google.protobuf import timestamp_pb2 # type: ignore +from google.protobuf import wrappers_pb2 # type: ignore from google.rpc import status_pb2 # type: ignore @@ -245,6 +247,20 @@ class RecognitionConfig(proto.Message): language tag. Example: "en-US". See `Language Support `__ for a list of the currently supported language codes. + alternative_language_codes (Sequence[str]): + A list of up to 3 additional + `BCP-47 `__ + language tags, listing possible alternative languages of the + supplied audio. See `Language + Support `__ + for a list of the currently supported language codes. If + alternative languages are listed, recognition result will + contain recognition in the most likely language detected + including the main language_code. The recognition result + will include the language tag of the language detected in + the audio. Note: This feature is only supported for Voice + Command and Voice Search use cases and performance may vary + for other use cases (e.g., phone call transcription). max_alternatives (int): Maximum number of recognition hypotheses to be returned. Specifically, the maximum number of @@ -258,6 +274,12 @@ class RecognitionConfig(proto.Message): profanities, replacing all but the initial character in each filtered word with asterisks, e.g. "f***". If set to ``false`` or omitted, profanities won't be filtered out. + adaptation (google.cloud.speech_v1.types.SpeechAdaptation): + Speech adaptation configuration improves the accuracy of + speech recognition. For more information, see the `speech + adaptation `__ + documentation. When speech adaptation is set it supersedes + the ``speech_contexts`` field. 
speech_contexts (Sequence[google.cloud.speech_v1.types.SpeechContext]): Array of [SpeechContext][google.cloud.speech.v1.SpeechContext]. A @@ -269,6 +291,11 @@ class RecognitionConfig(proto.Message): start and end time offsets (timestamps) for those words. If ``false``, no word-level time offset information is returned. The default is ``false``. + enable_word_confidence (bool): + If ``true``, the top result includes a list of words and the + confidence for those words. If ``false``, no word-level + confidence information is returned. The default is + ``false``. enable_automatic_punctuation (bool): If 'true', adds punctuation to recognition result hypotheses. This feature is only @@ -276,6 +303,23 @@ class RecognitionConfig(proto.Message): requests in other languages has no effect at all. The default 'false' value does not add punctuation to result hypotheses. + enable_spoken_punctuation (google.protobuf.wrappers_pb2.BoolValue): + The spoken punctuation behavior for the call If not set, + uses default behavior based on model of choice e.g. + command_and_search will enable spoken punctuation by default + If 'true', replaces spoken punctuation with the + corresponding symbols in the request. For example, "how are + you question mark" becomes "how are you?". See + https://cloud.google.com/speech-to-text/docs/spoken-punctuation + for support. If 'false', spoken punctuation is not replaced. + enable_spoken_emojis (google.protobuf.wrappers_pb2.BoolValue): + The spoken emoji behavior for the call + If not set, uses default behavior based on model + of choice If 'true', adds spoken emoji + formatting for the request. This will replace + spoken emojis with the corresponding Unicode + symbols in the final transcript. If 'false', + spoken emojis are not replaced. 
diarization_config (google.cloud.speech_v1.types.SpeakerDiarizationConfig): Config to enable speaker diarization and set additional parameters to make diarization better @@ -352,7 +396,7 @@ class AudioEncoding(proto.Enum): codecs are used to capture or transmit audio, particularly if background noise is present. Lossy codecs include ``MULAW``, ``AMR``, ``AMR_WB``, ``OGG_OPUS``, ``SPEEX_WITH_HEADER_BYTE``, - ``MP3``. + ``MP3``, and ``WEBM_OPUS``. The ``FLAC`` and ``WAV`` audio file formats include a header that describes the included audio content. You can request recognition @@ -374,19 +418,31 @@ class AudioEncoding(proto.Enum): AMR_WB = 5 OGG_OPUS = 6 SPEEX_WITH_HEADER_BYTE = 7 + WEBM_OPUS = 9 encoding = proto.Field(proto.ENUM, number=1, enum=AudioEncoding,) sample_rate_hertz = proto.Field(proto.INT32, number=2,) audio_channel_count = proto.Field(proto.INT32, number=7,) enable_separate_recognition_per_channel = proto.Field(proto.BOOL, number=12,) language_code = proto.Field(proto.STRING, number=3,) + alternative_language_codes = proto.RepeatedField(proto.STRING, number=18,) max_alternatives = proto.Field(proto.INT32, number=4,) profanity_filter = proto.Field(proto.BOOL, number=5,) + adaptation = proto.Field( + proto.MESSAGE, number=20, message=resource.SpeechAdaptation, + ) speech_contexts = proto.RepeatedField( proto.MESSAGE, number=6, message="SpeechContext", ) enable_word_time_offsets = proto.Field(proto.BOOL, number=8,) + enable_word_confidence = proto.Field(proto.BOOL, number=15,) enable_automatic_punctuation = proto.Field(proto.BOOL, number=11,) + enable_spoken_punctuation = proto.Field( + proto.MESSAGE, number=22, message=wrappers_pb2.BoolValue, + ) + enable_spoken_emojis = proto.Field( + proto.MESSAGE, number=23, message=wrappers_pb2.BoolValue, + ) diarization_config = proto.Field( proto.MESSAGE, number=19, message="SpeakerDiarizationConfig", ) @@ -534,9 +590,21 @@ class SpeechContext(proto.Message): for every month of the year, using the $MONTH class 
improves the likelihood of correctly transcribing audio that includes months. + boost (float): + Hint Boost. Positive value will increase the probability + that a specific phrase will be recognized over other similar + sounding phrases. The higher the boost, the higher the + chance of false positive recognition as well. Negative boost + values would correspond to anti-biasing. Anti-biasing is not + enabled, so negative boost will simply be ignored. Though + ``boost`` can accept a wide range of positive values, most + use cases are best served with values between 0 and 20. We + recommend using a binary search approach to finding the + optimal value for your use case. """ phrases = proto.RepeatedField(proto.STRING, number=1,) + boost = proto.Field(proto.FLOAT, number=4,) class RecognitionAudio(proto.Message): @@ -617,6 +685,12 @@ class LongRunningRecognizeResponse(proto.Message): total_billed_time (google.protobuf.duration_pb2.Duration): When available, billed audio seconds for the corresponding request. + output_config (google.cloud.speech_v1.types.TranscriptOutputConfig): + Original output config if present in the + request. + output_error (google.rpc.status_pb2.Status): + If the transcript output fails this field + contains the relevant error. """ results = proto.RepeatedField( @@ -625,6 +699,10 @@ class LongRunningRecognizeResponse(proto.Message): total_billed_time = proto.Field( proto.MESSAGE, number=3, message=duration_pb2.Duration, ) + output_config = proto.Field( + proto.MESSAGE, number=6, message="TranscriptOutputConfig", + ) + output_error = proto.Field(proto.MESSAGE, number=7, message=status_pb2.Status,) class LongRunningRecognizeMetadata(proto.Message): @@ -777,7 +855,7 @@ class StreamingRecognitionResult(proto.Message): that channel. For audio_channel_count = N, its output values can range from '1' to 'N'. language_code (str): - The + Output only. The `BCP-47 `__ language tag of the language in this result. 
This language code was detected to have the most likelihood of being @@ -812,12 +890,25 @@ class SpeechRecognitionResult(proto.Message): corresponding to the recognized result for the audio from that channel. For audio_channel_count = N, its output values can range from '1' to 'N'. + result_end_time (google.protobuf.duration_pb2.Duration): + Time offset of the end of this result + relative to the beginning of the audio. + language_code (str): + Output only. The + `BCP-47 `__ + language tag of the language in this result. This language + code was detected to have the most likelihood of being + spoken in the audio. """ alternatives = proto.RepeatedField( proto.MESSAGE, number=1, message="SpeechRecognitionAlternative", ) channel_tag = proto.Field(proto.INT32, number=2,) + result_end_time = proto.Field( + proto.MESSAGE, number=4, message=duration_pb2.Duration, + ) + language_code = proto.Field(proto.STRING, number=5,) class SpeechRecognitionAlternative(proto.Message): @@ -866,6 +957,15 @@ class WordInfo(proto.Message): word (str): The word corresponding to this set of information. + confidence (float): + The confidence estimate between 0.0 and 1.0. A higher number + indicates an estimated greater likelihood that the + recognized words are correct. This field is set only for the + top alternative of a non-streaming result or, of a streaming + result where ``is_final=true``. This field is not guaranteed + to be accurate and users should not rely on it to be always + provided. The default of 0.0 is a sentinel value indicating + ``confidence`` was not set. speaker_tag (int): Output only. A distinct integer value is assigned for every speaker within the audio. 
This field specifies which one of @@ -878,6 +978,7 @@ class WordInfo(proto.Message): start_time = proto.Field(proto.MESSAGE, number=1, message=duration_pb2.Duration,) end_time = proto.Field(proto.MESSAGE, number=2, message=duration_pb2.Duration,) word = proto.Field(proto.STRING, number=3,) + confidence = proto.Field(proto.FLOAT, number=4,) speaker_tag = proto.Field(proto.INT32, number=5,) diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/types/resource.py b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/resource.py new file mode 100644 index 000000000000..c7286b81ec2c --- /dev/null +++ b/packages/google-cloud-python-speech/google/cloud/speech_v1/types/resource.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import proto # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.speech.v1", + manifest={"CustomClass", "PhraseSet", "SpeechAdaptation",}, +) + + +class CustomClass(proto.Message): + r"""A set of words or phrases that represents a common concept + likely to appear in your audio, for example a list of passenger + ship names. CustomClass items can be substituted into + placeholders that you set in PhraseSet phrases. + + Attributes: + name (str): + The resource name of the custom class. + custom_class_id (str): + If this custom class is a resource, the custom_class_id is + the resource id of the CustomClass. Case sensitive. 
+ items (Sequence[google.cloud.speech_v1.types.CustomClass.ClassItem]): + A collection of class items. + """ + + class ClassItem(proto.Message): + r"""An item of the class. + + Attributes: + value (str): + The class item's value. + """ + + value = proto.Field(proto.STRING, number=1,) + + name = proto.Field(proto.STRING, number=1,) + custom_class_id = proto.Field(proto.STRING, number=2,) + items = proto.RepeatedField(proto.MESSAGE, number=3, message=ClassItem,) + + +class PhraseSet(proto.Message): + r"""Provides "hints" to the speech recognizer to favor specific + words and phrases in the results. + + Attributes: + name (str): + The resource name of the phrase set. + phrases (Sequence[google.cloud.speech_v1.types.PhraseSet.Phrase]): + A list of word and phrases. + boost (float): + Hint Boost. Positive value will increase the probability + that a specific phrase will be recognized over other similar + sounding phrases. The higher the boost, the higher the + chance of false positive recognition as well. Negative boost + values would correspond to anti-biasing. Anti-biasing is not + enabled, so negative boost will simply be ignored. Though + ``boost`` can accept a wide range of positive values, most + use cases are best served with values between 0 (exclusive) + and 20. We recommend using a binary search approach to + finding the optimal value for your use case. Speech + recognition will skip PhraseSets with a boost value of 0. + """ + + class Phrase(proto.Message): + r"""A phrases containing words and phrase "hints" so that the speech + recognition is more likely to recognize them. This can be used to + improve the accuracy for specific words and phrases, for example, if + specific commands are typically spoken by the user. This can also be + used to add additional words to the vocabulary of the recognizer. + See `usage + limits `__. 
+ + List items can also include pre-built or custom classes containing + groups of words that represent common concepts that occur in natural + language. For example, rather than providing a phrase hint for every + month of the year (e.g. "i was born in january", "i was born in + february", ...), using the pre-built ``$MONTH`` class improves the + likelihood of correctly transcribing audio that includes months + (e.g. "i was born in $month"). To refer to pre-built classes, use + the class' symbol prepended with ``$`` e.g. ``$MONTH``. To refer to + custom classes that were defined inline in the request, set the + class's ``custom_class_id`` to a string unique to all class + resources and inline classes. Then use the class' id wrapped in + $\ ``{...}`` e.g. "${my-months}". To refer to custom classes + resources, use the class' id wrapped in ``${}`` (e.g. + ``${my-months}``). + + Speech-to-Text supports three locations: ``global``, ``us`` (US + North America), and ``eu`` (Europe). If you are calling the + ``speech.googleapis.com`` endpoint, use the ``global`` location. To + specify a region, use a `regional + endpoint `__ with matching ``us`` or + ``eu`` location value. + + Attributes: + value (str): + The phrase itself. + boost (float): + Hint Boost. Overrides the boost set at the phrase set level. + Positive value will increase the probability that a specific + phrase will be recognized over other similar sounding + phrases. The higher the boost, the higher the chance of + false positive recognition as well. Negative boost will + simply be ignored. Though ``boost`` can accept a wide range + of positive values, most use cases are best served with + values between 0 and 20. We recommend using a binary search + approach to finding the optimal value for your use case. + Speech recognition will skip PhraseSets with a boost value + of 0. 
+ """ + + value = proto.Field(proto.STRING, number=1,) + boost = proto.Field(proto.FLOAT, number=2,) + + name = proto.Field(proto.STRING, number=1,) + phrases = proto.RepeatedField(proto.MESSAGE, number=2, message=Phrase,) + boost = proto.Field(proto.FLOAT, number=4,) + + +class SpeechAdaptation(proto.Message): + r"""Speech adaptation configuration. + + Attributes: + phrase_sets (Sequence[google.cloud.speech_v1.types.PhraseSet]): + A collection of phrase sets. To specify the hints inline, + leave the phrase set's ``name`` blank and fill in the rest + of its fields. Any phrase set can use any custom class. + phrase_set_references (Sequence[str]): + A collection of phrase set resource names to + use. + custom_classes (Sequence[google.cloud.speech_v1.types.CustomClass]): + A collection of custom classes. To specify the classes + inline, leave the class' ``name`` blank and fill in the rest + of its fields, giving it a unique ``custom_class_id``. Refer + to the inline defined class in phrase hints by its + ``custom_class_id``. 
+ """ + + phrase_sets = proto.RepeatedField(proto.MESSAGE, number=1, message="PhraseSet",) + phrase_set_references = proto.RepeatedField(proto.STRING, number=2,) + custom_classes = proto.RepeatedField( + proto.MESSAGE, number=3, message="CustomClass", + ) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/packages/google-cloud-python-speech/tests/unit/gapic/speech_v1/test_speech.py b/packages/google-cloud-python-speech/tests/unit/gapic/speech_v1/test_speech.py index 6d6e0c1fd77b..9d5638991c57 100644 --- a/packages/google-cloud-python-speech/tests/unit/gapic/speech_v1/test_speech.py +++ b/packages/google-cloud-python-speech/tests/unit/gapic/speech_v1/test_speech.py @@ -38,9 +38,11 @@ from google.cloud.speech_v1.services.speech import SpeechClient from google.cloud.speech_v1.services.speech import transports from google.cloud.speech_v1.types import cloud_speech +from google.cloud.speech_v1.types import resource from google.longrunning import operations_pb2 from google.oauth2 import service_account from google.protobuf import duration_pb2 # type: ignore +from google.protobuf import wrappers_pb2 # type: ignore from google.rpc import status_pb2 # type: ignore import google.auth @@ -1288,6 +1290,54 @@ def test_speech_grpc_lro_async_client(): assert transport.operations_client is transport.operations_client +def test_custom_class_path(): + project = "squid" + location = "clam" + custom_class = "whelk" + expected = "projects/{project}/locations/{location}/customClasses/{custom_class}".format( + project=project, location=location, custom_class=custom_class, + ) + actual = SpeechClient.custom_class_path(project, location, custom_class) + assert expected == actual + + +def test_parse_custom_class_path(): + expected = { + "project": "octopus", + "location": "oyster", + "custom_class": "nudibranch", + } + path = SpeechClient.custom_class_path(**expected) + + # Check that the path construction is reversible. 
+ actual = SpeechClient.parse_custom_class_path(path) + assert expected == actual + + +def test_phrase_set_path(): + project = "cuttlefish" + location = "mussel" + phrase_set = "winkle" + expected = "projects/{project}/locations/{location}/phraseSets/{phrase_set}".format( + project=project, location=location, phrase_set=phrase_set, + ) + actual = SpeechClient.phrase_set_path(project, location, phrase_set) + assert expected == actual + + +def test_parse_phrase_set_path(): + expected = { + "project": "nautilus", + "location": "scallop", + "phrase_set": "abalone", + } + path = SpeechClient.phrase_set_path(**expected) + + # Check that the path construction is reversible. + actual = SpeechClient.parse_phrase_set_path(path) + assert expected == actual + + def test_common_billing_account_path(): billing_account = "squid" expected = "billingAccounts/{billing_account}".format(