From 58e245444e386c6bc690e0efdbcdd3d76a8476f4 Mon Sep 17 00:00:00 2001
From: Google Colaboratory Team <colaboratory-team@google.com>
Date: Thu, 21 Dec 2023 13:33:58 -0800
Subject: [PATCH] No public description

PiperOrigin-RevId: 592938883
---
 ...ini_with_Google's_Speech_to_Text_API.ipynb | 559 ++++++++++++++++++
 1 file changed, 559 insertions(+)
 create mode 100644 notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb

diff --git a/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb b/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb
new file mode 100644
index 00000000..1193ae8f
--- /dev/null
+++ b/notebooks/Talk_to_Gemini_with_Google's_Speech_to_Text_API.ipynb
@@ -0,0 +1,559 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "fa2fe94c30c644deb009e4aac464c93a": {
+          "model_module": "jupyter-webrtc",
+          "model_name": "AudioRecorderModel",
+          "model_module_version": "~0.6.0",
+          "state": {
+            "_data_src": "blob:https://5t7fh9enxfp-496ff2e9c6d22116-0-colab.googleusercontent.com/c0619ff6-ae72-4ab9-9908-623b31d805b5",
+            "_dom_classes": [],
+            "_model_module": "jupyter-webrtc",
+            "_model_module_version": "~0.6.0",
+            "_model_name": "AudioRecorderModel",
+            "_view_count": null,
+            "_view_module": "jupyter-webrtc",
+            "_view_module_version": "~0.6.0",
+            "_view_name": "AudioRecorderView",
+            "audio": "IPY_MODEL_b2753970c7554b2e8a6b3da182354c7f",
+            "autosave": false,
+            "codecs": "",
+            "filename": "record",
+            "format": "webm",
+            "layout": "IPY_MODEL_6174abce27bf45f5ab1e57ac3efa6d46",
+            "recording": false,
+            "stream": "IPY_MODEL_202e08b79fbf48b59d7f65220da3a9e4"
+          }
+        },
+        "b2753970c7554b2e8a6b3da182354c7f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "AudioModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "AudioModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "AudioView",
+            "autoplay": true,
+            "controls": true,
+            "format": "webm",
+            "layout": "IPY_MODEL_c09fa990067149ef892f09750c4c3224",
+            "loop": true
+          }
+        },
+        "6174abce27bf45f5ab1e57ac3efa6d46": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "202e08b79fbf48b59d7f65220da3a9e4": {
+          "model_module": "jupyter-webrtc",
+          "model_name": "CameraStreamModel",
+          "model_module_version": "~0.6.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "jupyter-webrtc",
+            "_model_module_version": "~0.6.0",
+            "_model_name": "CameraStreamModel",
+            "_view_count": null,
+            "_view_module": "jupyter-webrtc",
+            "_view_module_version": "~0.6.0",
+            "_view_name": "MediaStreamView",
+            "constraints": {
+              "audio": true,
+              "video": false
+            },
+            "layout": "IPY_MODEL_b7ca2f71f73a414e85b5451b29516d2d"
+          }
+        },
+        "c09fa990067149ef892f09750c4c3224": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "b7ca2f71f73a414e85b5451b29516d2d": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        }
+      }
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "<b>Talk to Gemini with the Speech-to-Text API</b>\n",
+        "\n",
+        "Having a spoken conversation with Gemini, Google's latest and most advanced model, is simple in a Colab notebook."
+      ],
+      "metadata": {
+        "id": "ZWhWniBGu3_Y"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Install Google Cloud's speech library - you must click \"Restart Session\" when the button appears\n",
+        "\n",
+        "!pip install -q google-cloud-speech\n",
+        "from google.cloud import speech\n"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "OY_Xx59bf95N",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "2557b9db-bcbd-4c48-94d8-9fec1191ddc5"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/274.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m30.7/274.5 kB\u001b[0m \u001b[31m901.0 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.8/274.5 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.5/274.5 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "<b>[Required] Set up a Google Cloud account</b>\n",
+        "\n",
+        "This part takes some setup: to use the Cloud Speech-to-Text API you need a Google Cloud account, a project, and billing. Start [here](https://console.cloud.google.com/getting-started).\n",
+        "\n",
+        "Once you've done that, come back here and enter your project ID in the next cell."
+      ],
+      "metadata": {
+        "id": "ClJy_DX901bC"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Authenticate with Google Cloud and your project ID\n",
+        "\n",
+        "from google.colab import auth\n",
+        "\n",
+        "gcp_project_id = '' # @param {type: \"string\"}\n",
+        "\n",
+        "auth.authenticate_user(project_id=gcp_project_id)"
+      ],
+      "metadata": {
+        "id": "_oO7-MlMpWd2",
+        "cellView": "form"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title [Run once per project] Enable the Google Cloud Speech-to-Text API\n",
+        "\n",
+        "!gcloud services enable speech.googleapis.com"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "Rmn52Ol1YIDp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Configure Gemini API key\n",
+        "\n",
+        "# Install the Gemini SDK and read your Gemini API key from Colab secrets\n",
+        "\n",
+        "!pip install -q -U google-generativeai\n",
+        "import google.generativeai as genai\n",
+        "from google.colab import userdata\n",
+        "\n",
+        "gemini_api_secret_name = 'GOOGLE_API_KEY'  # @param {type: \"string\"}\n",
+        "\n",
+        "try:\n",
+        "  GOOGLE_API_KEY=userdata.get(gemini_api_secret_name)\n",
+        "  genai.configure(api_key=GOOGLE_API_KEY)\n",
+        "except userdata.SecretNotFoundError as e:\n",
+        "   print(f'Secret not found\\n\\nThis expects you to create a secret named {gemini_api_secret_name} in Colab\\n\\nVisit https://makersuite.google.com/app/apikey to create an API key\\n\\nStore that in the secrets section on the left side of the notebook (key icon)\\n\\nName the secret {gemini_api_secret_name}')\n",
+        "   raise e\n",
+        "except userdata.NotebookAccessError as e:\n",
+        "  print(f'You need to grant this notebook access to the {gemini_api_secret_name} secret in order for the notebook to access Gemini on your behalf.')\n",
+        "  raise e\n",
+        "except Exception as e:\n",
+        "  # unknown error\n",
+        "  print(f\"There was an unknown error. Ensure you have a secret {gemini_api_secret_name} stored in Colab and it's a valid key from https://makersuite.google.com/app/apikey\")\n",
+        "  raise e\n",
+        "\n",
+        "model = genai.GenerativeModel('gemini-pro')"
+      ],
+      "metadata": {
+        "id": "yFv1abRcv2P2",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "cellView": "form",
+        "outputId": "8c017038-b36b-4e96-863b-9cc6c8f14c62"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/146.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/146.9 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m146.9/146.9 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Setup\n",
+        "\n",
+        "# Note: much of this code is adapted from https://codelabs.developers.google.com/codelabs/cloud-speech-text-python3#0\n",
+        "\n",
+        "# set up cloud speech detection functions\n",
+        "\n",
+        "from google.cloud import speech\n",
+        "\n",
+        "def speech_to_text(\n",
+        "    config: speech.RecognitionConfig,\n",
+        "    audio: speech.RecognitionAudio,\n",
+        ") -> speech.RecognizeResponse:\n",
+        "    client = speech.SpeechClient()\n",
+        "\n",
+        "    # Synchronous speech recognition request\n",
+        "    response = client.recognize(config=config, audio=audio)\n",
+        "\n",
+        "    return response\n",
+        "\n",
+        "def print_response(response: speech.RecognizeResponse):\n",
+        "    for result in response.results:\n",
+        "        print_result(result)\n",
+        "\n",
+        "def print_result(result: speech.SpeechRecognitionResult):\n",
+        "    best_alternative = result.alternatives[0]\n",
+        "    print(\"-\" * 80)\n",
+        "    print(f\"language_code: {result.language_code}\")\n",
+        "    print(f\"transcript:    {best_alternative.transcript}\")\n",
+        "    print(f\"confidence:    {best_alternative.confidence:.0%}\")\n",
+        "\n",
+        "# config for speech recognition; modify language here & other params\n",
+        "config = speech.RecognitionConfig(\n",
+        "    language_code=\"en\",\n",
+        "    enable_automatic_punctuation=True,\n",
+        ")\n",
+        "\n",
+        "# required set up to enable recording audio in your browser\n",
+        "\n",
+        "!pip install ipywebrtc\n",
+        "import io\n",
+        "from ipywebrtc import AudioRecorder, CameraStream\n",
+        "\n",
+        "# required in Colab to enable 3rd party widgets\n",
+        "from google.colab import output\n",
+        "output.enable_custom_widget_manager()\n",
+        "\n",
+        "# set up helper functions for displaying text nicely\n",
+        "\n",
+        "from IPython.display import Markdown\n",
+        "import textwrap\n",
+        "\n",
+        "def to_markdown(text):\n",
+        "  text = text.replace('•', '  *')\n",
+        "  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "cellView": "form",
+        "id": "OJ3WhHy5faQl",
+        "outputId": "34a9fa0e-3426-45b2-99b4-b9a2db485747"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Collecting ipywebrtc\n",
+            "  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)\n",
+            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/260.7 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.9/260.7 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m260.7/260.7 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hInstalling collected packages: ipywebrtc\n",
+            "Successfully installed ipywebrtc-0.6.0\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Record your speech\n",
+        "\n",
+        "# create a microphone stream\n",
+        "camera = CameraStream(constraints={'audio': True, 'video':False})\n",
+        "\n",
+        "# create an audio recorder that uses the microphone stream\n",
+        "recorder = AudioRecorder(stream=camera)\n",
+        "\n",
+        "# display the recorder widget\n",
+        "recorder"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 107,
+          "referenced_widgets": [
+            "fa2fe94c30c644deb009e4aac464c93a",
+            "b2753970c7554b2e8a6b3da182354c7f",
+            "6174abce27bf45f5ab1e57ac3efa6d46",
+            "202e08b79fbf48b59d7f65220da3a9e4",
+            "c09fa990067149ef892f09750c4c3224",
+            "b7ca2f71f73a414e85b5451b29516d2d"
+          ]
+        },
+        "cellView": "form",
+        "id": "8sL9dR8Vfsf9",
+        "outputId": "0ba80158-dab5-4d2b-d233-0b407a2a8030"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "fa2fe94c30c644deb009e4aac464c93a"
+            }
+          },
+          "metadata": {
+            "application/vnd.jupyter.widget-view+json": {
+              "colab": {
+                "custom_widget_manager": {
+                  "url": "https://ssl.gstatic.com/colaboratory-static/widgets/colab-cdn-widget-manager/b3e629b1971e1542/manager.min.js"
+                }
+              }
+            }
+          }
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title Transcribe and send to Gemini\n",
+        "\n",
+        "recorded_audio = recorder.audio.value\n",
+        "\n",
+        "# to save the recording (WebM audio) to a file, uncomment the next two lines\n",
+        "#with open(\"output.webm\", \"wb\") as f:\n",
+        "#    f.write(recorder.audio.value)\n",
+        "\n",
+        "audio = speech.RecognitionAudio(\n",
+        "    content=recorded_audio,\n",
+        ")\n",
+        "\n",
+        "processing_results = speech_to_text(config, audio)\n",
+        "audio_text = processing_results.results[0].alternatives[0].transcript\n",
+        "\n",
+        "response = model.generate_content(audio_text)\n",
+        "\n",
+        "to_markdown(f'**You**: {audio_text}\\n\\n**Gemini**:\\n{response.text}')"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 407
+        },
+        "cellView": "form",
+        "id": "XSaKEGP_lxF2",
+        "outputId": "264180f1-83a8-4b94-d841-b627fb6af7e2"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.Markdown object>"
+            ],
+            "text/markdown": "> **You**: Can you compose a sketch for Saturday Night Live that includes corgis and Keanu Reeves?\n> \n> **Gemini**:\n> Title: Keanu Reeves and the Corgi Kingdom\n> \n> [Scene: A magical forest. Keanu Reeves is walking through the forest, dressed in a wizard's robe.]\n> \n> Keanu Reeves: (to himself) I am Keanu Reeves, the Great Wizard of Corgis. I must find the lost kingdom of the corgis.\n> \n> [Keanu continues walking and comes across a group of corgis playing in a clearing.]\n> \n> Keanu Reeves: (excited) Corgis!\n> \n> [The corgis stop playing and look at Keanu.]\n> \n> Keanu Reeves: I am here to help you. I will lead you to your lost kingdom.\n> \n> [The corgis bark happily and start following Keanu.]\n> \n> [Keanu and the corgis walk through the forest, encountering various obstacles along the way. They are attacked by a pack of wolves, but Keanu uses his magic to defeat them.]\n> \n> [Finally, they reach the lost kingdom of the corgis. The corgis are overjoyed and celebrate Keanu's arrival.]\n> \n> Corgi King: (bowing to Keanu) Thank you, Great Wizard of Corgis. You have saved our kingdom.\n> \n> Keanu Reeves: (smiling) You're welcome, Corgi King. I am glad I could help.\n> \n> [Keanu and the corgis live happily ever after in the lost kingdom.]\n> \n> [End Scene]"
+          },
+          "metadata": {},
+          "execution_count": 7
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file