fix(openai): Include audio messages in request (#7015)
bracesproul authored Oct 18, 2024
1 parent 391bc46 commit 7671f00
Showing 7 changed files with 506 additions and 9 deletions.
153 changes: 153 additions & 0 deletions docs/core_docs/docs/integrations/chat/openai.ipynb
@@ -1028,6 +1028,159 @@
"console.log(\"USAGE:\", resWitCaching.response_metadata.usage);"
]
},
{
"cell_type": "markdown",
"id": "cc8b3c94",
"metadata": {},
"source": [
"## Audio output\n",
"\n",
"Some OpenAI models (such as `gpt-4o-audio-preview`) support generating audio output. This example shows how to use that feature:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b4d579b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" id: 'audio_67129e9466f48190be70372922464162',\n",
" data: 'UklGRgZ4BABXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAATElTVBoAAABJTkZPSVNGVA4AAABMYXZmNTguMjkuMTAwAGRhdGHA',\n",
" expires_at: 1729277092,\n",
" transcript: \"Why did the cat sit on the computer's keyboard? Because it wanted to keep an eye on the mouse!\"\n",
"}\n"
]
}
],
"source": [
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"const modelWithAudioOutput = new ChatOpenAI({\n",
" model: \"gpt-4o-audio-preview\",\n",
" // You may also pass these fields to `.bind` as a call argument.\n",
" modalities: [\"text\", \"audio\"], // Specifies that the model should output audio.\n",
" audio: {\n",
" voice: \"alloy\",\n",
" format: \"wav\",\n",
" },\n",
"});\n",
"\n",
"const audioOutputResult = await modelWithAudioOutput.invoke(\"Tell me a joke about cats.\");\n",
"const castAudioContent = audioOutputResult.additional_kwargs.audio as Record<string, any>;\n",
"\n",
"console.log({\n",
" ...castAudioContent,\n",
" data: castAudioContent.data.slice(0, 100) // Sliced for brevity\n",
"})"
]
},
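{
"cell_type": "markdown",
"id": "3f9d2b71",
"metadata": {},
"source": [
"As a minimal sketch (assuming a Node.js environment; the file name is illustrative), the base64-encoded WAV data can be decoded and written to a playable file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e4a1c05",
"metadata": {},
"outputs": [],
"source": [
"import * as fs from \"node:fs\";\n",
"\n",
"// Decode the base64 WAV payload and write it to disk as a playable file.\n",
"fs.writeFileSync(\"cat-joke.wav\", Buffer.from(castAudioContent.data, \"base64\"));"
]
},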
{
"cell_type": "markdown",
"id": "bfea3608",
"metadata": {},
"source": [
"We see that the audio data is returned inside the `data` field. We are also provided an `expires_at` date field. This field represents the date the audio response will no longer be accessible on the server for use in multi-turn conversations.\n",
"\n",
"### Streaming Audio Output\n",
"\n",
"OpenAI also supports streaming audio output. Here's an example:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0fa68183",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" id: 'audio_67129e976ce081908103ba4947399a3eaudio_67129e976ce081908103ba4947399a3e',\n",
" transcript: 'Why was the cat sitting on the computer? Because it wanted to keep an eye on the mouse!',\n",
" index: 0,\n",
" data: 'CgAGAAIADAAAAA0AAwAJAAcACQAJAAQABQABAAgABQAPAAAACAADAAUAAwD8/wUA+f8MAPv/CAD7/wUA///8/wUA/f8DAPj/AgD6',\n",
" expires_at: 1729277096\n",
"}\n"
]
}
],
"source": [
"import { AIMessageChunk } from \"@langchain/core/messages\";\n",
"import { concat } from \"@langchain/core/utils/stream\"\n",
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"const modelWithStreamingAudioOutput = new ChatOpenAI({\n",
" model: \"gpt-4o-audio-preview\",\n",
" modalities: [\"text\", \"audio\"],\n",
" audio: {\n",
" voice: \"alloy\",\n",
" format: \"pcm16\", // Format must be `pcm16` for streaming\n",
" },\n",
"});\n",
"\n",
"const audioOutputStream = await modelWithStreamingAudioOutput.stream(\"Tell me a joke about cats.\");\n",
"let finalAudioOutputMsg: AIMessageChunk | undefined;\n",
"for await (const chunk of audioOutputStream) {\n",
" finalAudioOutputMsg = finalAudioOutputMsg ? concat(finalAudioOutputMsg, chunk) : chunk;\n",
"}\n",
"const castStreamedAudioContent = finalAudioOutputMsg?.additional_kwargs.audio as Record<string, any>;\n",
"\n",
"console.log({\n",
" ...castStreamedAudioContent,\n",
" data: castStreamedAudioContent.data.slice(0, 100) // Sliced for brevity\n",
"})"
]
},
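{
"cell_type": "markdown",
"id": "9b2f4e71",
"metadata": {},
"source": [
"Since `pcm16` is raw PCM rather than a self-contained file, a WAV header must be prepended before the streamed audio is playable. Below is a minimal sketch, assuming OpenAI's documented `pcm16` layout (24 kHz, mono, 16-bit little-endian) and that the concatenated `data` field decodes as a single base64 string:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d8a3c16",
"metadata": {},
"outputs": [],
"source": [
"import * as fs from \"node:fs\";\n",
"\n",
"const pcm = Buffer.from(castStreamedAudioContent.data, \"base64\");\n",
"\n",
"// Build a standard 44-byte WAV header for 24 kHz mono 16-bit PCM.\n",
"const header = Buffer.alloc(44);\n",
"header.write(\"RIFF\", 0);\n",
"header.writeUInt32LE(36 + pcm.length, 4); // file size minus the first 8 bytes\n",
"header.write(\"WAVE\", 8);\n",
"header.write(\"fmt \", 12);\n",
"header.writeUInt32LE(16, 16); // fmt chunk size\n",
"header.writeUInt16LE(1, 20); // audio format: PCM\n",
"header.writeUInt16LE(1, 22); // channels: mono\n",
"header.writeUInt32LE(24000, 24); // sample rate\n",
"header.writeUInt32LE(24000 * 2, 28); // byte rate = sample rate * block align\n",
"header.writeUInt16LE(2, 32); // block align = channels * bytes per sample\n",
"header.writeUInt16LE(16, 34); // bits per sample\n",
"header.write(\"data\", 36);\n",
"header.writeUInt32LE(pcm.length, 40);\n",
"\n",
"fs.writeFileSync(\"cat-joke-streamed.wav\", Buffer.concat([header, pcm]));"
]
},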
{
"cell_type": "markdown",
"id": "e8b84aac",
"metadata": {},
"source": [
"### Audio input\n",
"\n",
"These models also support passing audio as input. For this, you must specify `input_audio` fields as seen below:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1a69dad8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"That's a great joke! It's always fun to imagine why cats do the funny things they do. Keeping an eye on the \"mouse\" is a creatively punny way to describe it!\n"
]
}
],
"source": [
"import { HumanMessage } from \"@langchain/core/messages\";\n",
"\n",
"const userInput = new HumanMessage({\n",
" content: [{\n",
" type: \"input_audio\",\n",
" input_audio: {\n",
" data: castAudioContent.data, // Re-use the base64 data from the first example\n",
" format: \"wav\",\n",
" },\n",
" }]\n",
"})\n",
"\n",
"// Re-use the same model instance\n",
"const userInputAudioRes = await modelWithAudioOutput.invoke([userInput]);\n",
"\n",
"console.log((userInputAudioRes.additional_kwargs.audio as Record<string, any>).transcript);"
]
},
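{
"cell_type": "markdown",
"id": "6b0e9f3a",
"metadata": {},
"source": [
"Audio responses can also take part in multi-turn conversations: when a previous `AIMessage` carries `additional_kwargs.audio.id`, that ID is forwarded to OpenAI so the server-side audio (available until `expires_at`) is used as context. A minimal sketch, reusing the messages from the examples above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c7d5e92",
"metadata": {},
"outputs": [],
"source": [
"// Pass the earlier audio response back as conversation history.\n",
"const multiTurnRes = await modelWithAudioOutput.invoke([\n",
"  new HumanMessage(\"Tell me a joke about cats.\"),\n",
"  audioOutputResult, // additional_kwargs.audio.id references the server-side audio\n",
"  new HumanMessage(\"Explain why that joke is funny.\"),\n",
"]);\n",
"\n",
"console.log((multiTurnRes.additional_kwargs.audio as Record<string, any>).transcript);"
]
},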
{
"cell_type": "markdown",
"id": "3a5bb5ca-c3ae-4a58-be67-2cd18574b9a3",
6 changes: 6 additions & 0 deletions libs/langchain-openai/audio.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion libs/langchain-openai/package.json
@@ -36,7 +36,7 @@
"license": "MIT",
"dependencies": {
"js-tiktoken": "^1.0.12",
"openai": "^4.67.2",
"openai": "^4.68.0",
"zod": "^3.22.4",
"zod-to-json-schema": "^3.22.3"
},
143 changes: 140 additions & 3 deletions libs/langchain-openai/src/chat_models.ts
@@ -174,6 +174,11 @@ function openAIResponseToChatMessage(
system_fingerprint: rawResponse.system_fingerprint,
};
}

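// Surface the audio payload (id, data, transcript, expires_at) on
// additional_kwargs so callers can read it off the returned message.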
if (message.audio) {
additional_kwargs.audio = message.audio;
}

return new AIMessage({
content: message.content || "",
tool_calls: toolCalls,
@@ -212,6 +217,14 @@ function _convertDeltaToMessageChunk(
if (includeRawResponse) {
additional_kwargs.__raw_response = rawResponse;
}

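// Record which choice the streamed audio delta belongs to so chunks
// can be matched up when message chunks are concatenated.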
if (delta.audio) {
additional_kwargs.audio = {
...delta.audio,
index: rawResponse.choices[0].index,
};
}

const response_metadata = { usage: { ...rawResponse.usage } };
if (role === "user") {
return new HumanMessageChunk({ content, response_metadata });
@@ -257,9 +270,11 @@ function _convertDeltaToMessageChunk(
}

// Used in LangSmith, export is important here
-export function _convertMessagesToOpenAIParams(messages: BaseMessage[]) {
+export function _convertMessagesToOpenAIParams(
+  messages: BaseMessage[]
+): OpenAICompletionParam[] {
// TODO: Function messages do not support array content, fix cast
-return messages.map((message) => {
+return messages.flatMap((message) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const completionParam: Record<string, any> = {
role: messageToOpenAIRole(message),
@@ -285,6 +300,21 @@ export function _convertMessagesToOpenAIParams(messages: BaseMessage[]) {
completionParam.tool_call_id = (message as ToolMessage).tool_call_id;
}
}

if (
message.additional_kwargs.audio &&
typeof message.additional_kwargs.audio === "object" &&
"id" in message.additional_kwargs.audio
) {
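// Emit a second assistant message that references the server-side audio
// by its ID so the audio output is sent back to OpenAI on later turns.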
const audioMessage = {
role: "assistant",
audio: {
id: message.additional_kwargs.audio.id,
},
};
return [completionParam, audioMessage] as OpenAICompletionParam[];
}

return completionParam as OpenAICompletionParam;
});
}
@@ -372,6 +402,27 @@ export interface ChatOpenAICallOptions
* @version 0.2.6
*/
strict?: boolean;

/**
* Output types that you would like the model to generate for this request. Most
* models are capable of generating text, which is the default:
*
* `["text"]`
*
* The `gpt-4o-audio-preview` model can also be used to
* [generate audio](https://platform.openai.com/docs/guides/audio). To request that
* this model generate both text and audio responses, you can use:
*
* `["text", "audio"]`
*/
modalities?: Array<OpenAIClient.Chat.ChatCompletionModality>;

/**
* Parameters for audio output. Required when audio output is requested with
* `modalities: ["audio"]`.
* [Learn more](https://platform.openai.com/docs/guides/audio).
*/
audio?: OpenAIClient.Chat.ChatCompletionAudioParam;
}

export interface ChatOpenAIFields
@@ -842,6 +893,80 @@ export interface ChatOpenAIFields
* </details>
*
* <br />
*
* <details>
* <summary><strong>Audio Outputs</strong></summary>
*
* ```typescript
* import { ChatOpenAI } from "@langchain/openai";
*
* const modelWithAudioOutput = new ChatOpenAI({
* model: "gpt-4o-audio-preview",
* // You may also pass these fields to `.bind` as a call argument.
* modalities: ["text", "audio"], // Specifies that the model should output audio.
* audio: {
* voice: "alloy",
* format: "wav",
* },
* });
*
* const audioOutputResult = await modelWithAudioOutput.invoke("Tell me a joke about cats.");
* const castAudioContent = audioOutputResult.additional_kwargs.audio as Record<string, any>;
*
* console.log({
* ...castAudioContent,
* data: castAudioContent.data.slice(0, 100) // Sliced for brevity
* })
* ```
*
* ```txt
* {
* id: 'audio_67117718c6008190a3afad3e3054b9b6',
* data: 'UklGRqYwBgBXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAATElTVBoAAABJTkZPSVNGVA4AAABMYXZmNTguMjkuMTAwAGRhdGFg',
* expires_at: 1729201448,
* transcript: 'Sure! Why did the cat sit on the computer? Because it wanted to keep an eye on the mouse!'
* }
* ```
* </details>
*
* <br />
*/
export class ChatOpenAI<
CallOptions extends ChatOpenAICallOptions = ChatOpenAICallOptions
@@ -958,6 +1083,10 @@ export class ChatOpenAI<
*/
supportsStrictToolCalling?: boolean;

audio?: OpenAIClient.Chat.ChatCompletionAudioParam;

modalities?: Array<OpenAIClient.Chat.ChatCompletionModality>;

constructor(
fields?: ChatOpenAIFields,
/** @deprecated */
@@ -1026,6 +1155,8 @@ export class ChatOpenAI<
this.stopSequences = this?.stop;
this.user = fields?.user;
this.__includeRawResponse = fields?.__includeRawResponse;
this.audio = fields?.audio;
this.modalities = fields?.modalities;

if (this.azureOpenAIApiKey || this.azureADTokenProvider) {
if (
@@ -1190,6 +1321,12 @@ export class ChatOpenAI<
seed: options?.seed,
...streamOptionsConfig,
parallel_tool_calls: options?.parallel_tool_calls,
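// Only include audio and modalities when configured; constructor-level
// values take precedence over matching call options.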
...(this.audio || options?.audio
? { audio: this.audio || options?.audio }
: {}),
...(this.modalities || options?.modalities
? { modalities: this.modalities || options?.modalities }
: {}),
...this.modelKwargs,
};
return params;
@@ -1241,7 +1378,7 @@ export class ChatOpenAI<
const streamIterable = await this.completionWithRetry(params, options);
let usage: OpenAIClient.Completions.CompletionUsage | undefined;
for await (const data of streamIterable) {
-const choice = data?.choices[0];
+const choice = data?.choices?.[0];
if (data.usage) {
usage = data.usage;
}
