From 58e245444e386c6bc690e0efdbcdd3d76a8476f4 Mon Sep 17 00:00:00 2001 From: Google Colaboratory Team Date: Thu, 21 Dec 2023 13:33:58 -0800 Subject: [PATCH] No public description PiperOrigin-RevId: 592938883 --- ...ini_with_Google's_Speech_to_Text_API.ipynb | 559 ++++++++++++++++++ 1 file changed, 559 insertions(+) create mode 100644 notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb diff --git a/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb b/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb new file mode 100644 index 00000000..1193ae8f --- /dev/null +++ b/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb @@ -0,0 +1,559 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "fa2fe94c30c644deb009e4aac464c93a": { + "model_module": "jupyter-webrtc", + "model_name": "AudioRecorderModel", + "model_module_version": "~0.6.0", + "state": { + "_data_src": "blob:https://5t7fh9enxfp-496ff2e9c6d22116-0-colab.googleusercontent.com/c0619ff6-ae72-4ab9-9908-623b31d805b5", + "_dom_classes": [], + "_model_module": "jupyter-webrtc", + "_model_module_version": "~0.6.0", + "_model_name": "AudioRecorderModel", + "_view_count": null, + "_view_module": "jupyter-webrtc", + "_view_module_version": "~0.6.0", + "_view_name": "AudioRecorderView", + "audio": "IPY_MODEL_b2753970c7554b2e8a6b3da182354c7f", + "autosave": false, + "codecs": "", + "filename": "record", + "format": "webm", + "layout": "IPY_MODEL_6174abce27bf45f5ab1e57ac3efa6d46", + "recording": false, + "stream": "IPY_MODEL_202e08b79fbf48b59d7f65220da3a9e4" + } + }, + "b2753970c7554b2e8a6b3da182354c7f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "AudioModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "AudioModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "AudioView", + "autoplay": true, + "controls": true, + "format": "webm", + "layout": "IPY_MODEL_c09fa990067149ef892f09750c4c3224", + "loop": true + } + }, + "6174abce27bf45f5ab1e57ac3efa6d46": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "202e08b79fbf48b59d7f65220da3a9e4": { + "model_module": "jupyter-webrtc", + "model_name": "CameraStreamModel", + "model_module_version": "~0.6.0", + "state": { + "_dom_classes": [], + "_model_module": "jupyter-webrtc", + "_model_module_version": "~0.6.0", + "_model_name": "CameraStreamModel", + "_view_count": null, + "_view_module": "jupyter-webrtc", + "_view_module_version": "~0.6.0", + "_view_name": "MediaStreamView", + "constraints": { + "audio": true, + "video": false + }, + "layout": "IPY_MODEL_b7ca2f71f73a414e85b5451b29516d2d" + } + }, + "c09fa990067149ef892f09750c4c3224": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b7ca2f71f73a414e85b5451b29516d2d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Talk to Gemini with the Speech-to-Text API\n", + "\n", + "Having a spoken conversation with Gemini, Google's latest and most advanced model, is simple in a Colab notebook." + ], + "metadata": { + "id": "ZWhWniBGu3_Y" + } + }, + { + "cell_type": "code", + "source": [ + "#@title Install Google Cloud's speech library - you must click \"Restart Session\" when the button appears\n", + "\n", + "!pip install -q google-cloud-speech\n", + "from google.cloud import speech\n" + ], + "metadata": { + "cellView": "form", + "id": "OY_Xx59bf95N", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2557b9db-bcbd-4c48-94d8-9fec1191ddc5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/274.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m30.7/274.5 kB\u001b[0m \u001b[31m901.0 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.8/274.5 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.5/274.5 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "[Required] Set up a Google Cloud account\n", + "\n", + "Okay so we get it, this part is hard, but in order to use the Cloud speech-to-text API you need to set up a Cloud account, project, and billing. Start [here](https://console.cloud.google.com/getting-started).\n", + "\n", + "Once you've done that, come back here." + ], + "metadata": { + "id": "ClJy_DX901bC" + } + }, + { + "cell_type": "code", + "source": [ + "#@title Authenticate with Google Cloud and your project ID\n", + "\n", + "from google.colab import auth\n", + "\n", + "gcp_project_id = '' # @param {type: \"string\"}\n", + "\n", + "auth.authenticate_user(project_id=gcp_project_id)" + ], + "metadata": { + "id": "_oO7-MlMpWd2", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title [Run once per project] Enable the Google Cloud speech-to-text API\n", + "\n", + "!gcloud services enable speech.googleapis.com" + ], + "metadata": { + "cellView": "form", + "id": "Rmn52Ol1YIDp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title Configure Gemini API key\n", + "\n", + "#Access your Gemini API key\n", + "\n", + "!pip install -q -U google-generativeai\n", + "import google.generativeai as genai\n", + "from google.colab import userdata\n", + "\n", + "gemini_api_secret_name = 'GOOGLE_API_KEY' # @param {type: \"string\"}\n", + "\n", + "try:\n", + " GOOGLE_API_KEY=userdata.get(gemini_api_secret_name)\n", + " genai.configure(api_key=GOOGLE_API_KEY)\n", + "except userdata.SecretNotFoundError as e:\n", + " print(f'Secret not found\\n\\nThis expects you to create a secret named {gemini_api_secret_name} in Colab\\n\\nVisit https://makersuite.google.com/app/apikey to create an API key\\n\\nStore that in the secrets section on the left side of the notebook (key icon)\\n\\nName the secret {gemini_api_secret_name}')\n", + " raise e\n", + "except userdata.NotebookAccessError as e:\n", + " print(f'You need to grant this notebook access to the {gemini_api_secret_name} secret in order for the notebook to access Gemini on your behalf.')\n", + " raise e\n", + "except Exception as e:\n", + " # unknown error\n", + " print(f\"There was an unknown error. Ensure you have a secret {gemini_api_secret_name} stored in Colab and it's a valid key from https://makersuite.google.com/app/apikey\")\n", + " raise e\n", + "\n", + "model = genai.GenerativeModel('gemini-pro')" + ], + "metadata": { + "id": "yFv1abRcv2P2", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "cellView": "form", + "outputId": "8c017038-b36b-4e96-863b-9cc6c8f14c62" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/146.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/146.9 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m146.9/146.9 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Setup\n", + "\n", + "# noting here that a lot of this code is forked from https://codelabs.developers.google.com/codelabs/cloud-speech-text-python3#0\n", + "\n", + "# set up cloud speech detection functions\n", + "\n", + "from google.cloud import speech\n", + "\n", + "def speech_to_text(\n", + " config: speech.RecognitionConfig,\n", + " audio: speech.RecognitionAudio,\n", + ") -> speech.RecognizeResponse:\n", + " client = speech.SpeechClient()\n", + "\n", + " # Synchronous speech recognition request\n", + " response = client.recognize(config=config, audio=audio)\n", + "\n", + " return response\n", + "\n", + "def print_response(response: speech.RecognizeResponse):\n", + " for result in response.results:\n", + " print_result(result)\n", + "\n", + "def print_result(result: speech.SpeechRecognitionResult):\n", + " best_alternative = result.alternatives[0]\n", + " print(\"-\" * 80)\n", + " print(f\"language_code: {result.language_code}\")\n", + " print(f\"transcript: {best_alternative.transcript}\")\n", + " print(f\"confidence: {best_alternative.confidence:.0%}\")\n", + "\n", + "# config for speech recognition; modify language here & other params\n", + "config = speech.RecognitionConfig(\n", + " language_code=\"en\",\n", + " enable_automatic_punctuation=True,\n", + ")\n", + "\n", + "# required set up to enable recording audio in your browser\n", + "\n", + "!pip install ipywebrtc\n", + "import io\n", + "from ipywebrtc import AudioRecorder, CameraStream\n", + "\n", + "# required in Colab to enable 3rd party widgets\n", + "from google.colab import output\n", + "output.enable_custom_widget_manager()\n", + "\n", + "# set up helper functions for displaying text nicely\n", + "\n", + "from IPython.display import Markdown\n", + "import textwrap\n", + "\n", + "def to_markdown(text):\n", + " text = text.replace('•', ' *')\n", + " return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "cellView": "form", + "id": "OJ3WhHy5faQl", + "outputId": "34a9fa0e-3426-45b2-99b4-b9a2db485747" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting ipywebrtc\n", + " Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/260.7 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.9/260.7 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m260.7/260.7 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: ipywebrtc\n", + "Successfully installed ipywebrtc-0.6.0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Record your speech\n", + "\n", + "# create a microphone stream\n", + "camera = CameraStream(constraints={'audio': True, 'video':False})\n", + "\n", + "# create an audio recorder that uses the microphone stream\n", + "recorder = AudioRecorder(stream=camera)\n", + "\n", + "# display the recorder widget\n", + "recorder" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 107, + "referenced_widgets": [ + "fa2fe94c30c644deb009e4aac464c93a", + "b2753970c7554b2e8a6b3da182354c7f", + "6174abce27bf45f5ab1e57ac3efa6d46", + "202e08b79fbf48b59d7f65220da3a9e4", + "c09fa990067149ef892f09750c4c3224", + "b7ca2f71f73a414e85b5451b29516d2d" + ] + }, + "cellView": "form", + "id": "8sL9dR8Vfsf9", + "outputId": "0ba80158-dab5-4d2b-d233-0b407a2a8030" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "fa2fe94c30c644deb009e4aac464c93a" + } + }, + "metadata": { + "application/vnd.jupyter.widget-view+json": { + "colab": { + "custom_widget_manager": { + "url": "https://ssl.gstatic.com/colaboratory-static/widgets/colab-cdn-widget-manager/b3e629b1971e1542/manager.min.js" + } + } + } + } + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Transcribe and send to Gemini\n", + "\n", + "recorded_audio = recorder.audio.value\n", + "\n", + "# if you ever want to save the output, uncomment the next two lines\n", + "#with open(\"output.wav\", \"wb\") as f:\n", + "# f.write(recorder.audio.value)\n", + "\n", + "audio = speech.RecognitionAudio(\n", + " content=recorded_audio,\n", + ")\n", + "\n", + "processing_results = speech_to_text(config, audio)\n", + "audio_text = processing_results.results[0].alternatives[0].transcript\n", + "\n", + "response = model.generate_content(audio_text)\n", + "\n", + "to_markdown(f'**You**: {audio_text}\\n\\n**Gemini**:\\n{response.text}')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 407 + }, + "cellView": "form", + "id": "XSaKEGP_lxF2", + "outputId": "264180f1-83a8-4b94-d841-b627fb6af7e2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "> **You**: Can you compose a sketch for Saturday Night Live that includes corgis and Keanu Reeves?\n> \n> **Gemini**:\n> Title: Keanu Reeves and the Corgi Kingdom\n> \n> [Scene: A magical forest. Keanu Reeves is walking through the forest, dressed in a wizard's robe.]\n> \n> Keanu Reeves: (to himself) I am Keanu Reeves, the Great Wizard of Corgis. I must find the lost kingdom of the corgis.\n> \n> [Keanu continues walking and comes across a group of corgis playing in a clearing.]\n> \n> Keanu Reeves: (excited) Corgis!\n> \n> [The corgis stop playing and look at Keanu.]\n> \n> Keanu Reeves: I am here to help you. I will lead you to your lost kingdom.\n> \n> [The corgis bark happily and start following Keanu.]\n> \n> [Keanu and the corgis walk through the forest, encountering various obstacles along the way. They are attacked by a pack of wolves, but Keanu uses his magic to defeat them.]\n> \n> [Finally, they reach the lost kingdom of the corgis. The corgis are overjoyed and celebrate Keanu's arrival.]\n> \n> Corgi King: (bowing to Keanu) Thank you, Great Wizard of Corgis. You have saved our kingdom.\n> \n> Keanu Reeves: (smiling) You're welcome, Corgi King. I am glad I could help.\n> \n> [Keanu and the corgis live happily ever after in the lost kingdom.]\n> \n> [End Scene]" + }, + "metadata": {}, + "execution_count": 7 + } + ] + } + ] +} \ No newline at end of file