diff --git a/app/client/api.ts b/app/client/api.ts index 94296b9aa81..8285b4d9f94 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -25,6 +25,7 @@ export const ROLES = ["system", "user", "assistant"] as const; export type MessageRole = (typeof ROLES)[number]; export const Models = ["gpt-3.5-turbo", "gpt-4"] as const; +export const TTSModels = ["tts-1", "tts-1-hd"] as const; export type ChatModel = ModelType; export interface MultimodalContent { @@ -53,6 +54,15 @@ export interface LLMConfig { style?: DalleRequestPayload["style"]; } +export interface SpeechOptions { + model: string; + input: string; + voice: string; + response_format?: string; + speed?: number; + onController?: (controller: AbortController) => void; +} + export interface ChatOptions { messages: RequestMessage[]; config: LLMConfig; @@ -87,6 +97,7 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise; + abstract speech(options: SpeechOptions): Promise; abstract usage(): Promise; abstract models(): Promise; } @@ -205,13 +216,16 @@ export function validString(x: string): boolean { return x?.length > 0; } -export function getHeaders() { +export function getHeaders(ignoreHeaders: boolean = false) { const accessStore = useAccessStore.getState(); const chatStore = useChatStore.getState(); - const headers: Record = { - "Content-Type": "application/json", - Accept: "application/json", - }; + let headers: Record = {}; + if (!ignoreHeaders) { + headers = { + "Content-Type": "application/json", + Accept: "application/json", + }; + } const clientConfig = getClientConfig(); diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index d5fa3042fc1..4ade9ebb98f 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -12,6 +12,7 @@ import { getHeaders, LLMApi, LLMModel, + SpeechOptions, MultimodalContent, } from "../api"; import Locale from "../../locales"; @@ -83,6 +84,10 @@ export class QwenApi implements LLMApi { return res?.output?.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts index df128c70497..7826838a61e 100644 --- a/app/client/platforms/anthropic.ts +++ b/app/client/platforms/anthropic.ts @@ -1,5 +1,5 @@ import { Anthropic, ApiPath } from "@/app/constant"; -import { ChatOptions, getHeaders, LLMApi } from "../api"; +import { ChatOptions, getHeaders, LLMApi, SpeechOptions } from "../api"; import { useAccessStore, useAppConfig, @@ -73,6 +73,10 @@ const ClaudeMapper = { const keys = ["claude-2, claude-instant-1"]; export class ClaudeApi implements LLMApi { + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + extractMessage(res: any) { console.log("[Response] claude response: ", res); diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts index 3be147f4985..c360417c602 100644 --- a/app/client/platforms/baidu.ts +++ b/app/client/platforms/baidu.ts @@ -14,6 +14,7 @@ import { LLMApi, LLMModel, MultimodalContent, + SpeechOptions, } from "../api"; import Locale from "../../locales"; import { @@ -75,6 +76,10 @@ export class ErnieApi implements LLMApi { return [baseUrl, path].join("/"); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function", diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts index 7677cafe12b..a6e2d426ee3 100644 --- a/app/client/platforms/bytedance.ts +++ b/app/client/platforms/bytedance.ts @@ -13,6 +13,7 @@ import { LLMApi, LLMModel, MultimodalContent, + SpeechOptions, } from "../api"; import Locale from "../../locales"; import { @@ -77,6 +78,10 @@ export class DoubaoApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts index 12d8846357a..ecb5ce44b57 100644 --- a/app/client/platforms/google.ts +++ b/app/client/platforms/google.ts @@ -1,5 +1,12 @@ import { ApiPath, Google, REQUEST_TIMEOUT_MS } from "@/app/constant"; -import { ChatOptions, getHeaders, LLMApi, LLMModel, LLMUsage } from "../api"; +import { + ChatOptions, + getHeaders, + LLMApi, + LLMModel, + LLMUsage, + SpeechOptions, +} from "../api"; import { useAccessStore, useAppConfig, useChatStore } from "@/app/store"; import { getClientConfig } from "@/app/config/client"; import { DEFAULT_API_HOST } from "@/app/constant"; @@ -56,6 +63,10 @@ export class GeminiProApi implements LLMApi { "" ); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions): Promise { const apiClient = this; let multimodal = false; diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts index e29b603e2dc..3931672e661 100644 --- a/app/client/platforms/iflytek.ts +++ b/app/client/platforms/iflytek.ts @@ -7,7 +7,13 @@ import { } from "@/app/constant"; import { useAccessStore, useAppConfig, useChatStore } from "@/app/store"; -import { ChatOptions, getHeaders, LLMApi, LLMModel } from "../api"; +import { + ChatOptions, + getHeaders, + LLMApi, + LLMModel, + SpeechOptions, +} from "../api"; import Locale from "../../locales"; import { EventStreamContentType, @@ -53,6 +59,10 @@ export class SparkApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts index d09c4619edf..6b197974571 100644 --- a/app/client/platforms/moonshot.ts +++ b/app/client/platforms/moonshot.ts @@ -14,7 +14,13 @@ import { usePluginStore, } from "@/app/store"; import { stream } from "@/app/utils/chat"; -import { ChatOptions, getHeaders, LLMApi, LLMModel } from "../api"; +import { + ChatOptions, + getHeaders, + LLMApi, + LLMModel, + SpeechOptions, +} from "../api"; import { getClientConfig } from "@/app/config/client"; import { getMessageTextContent } from "@/app/utils"; import { RequestPayload } from "./openai"; @@ -53,6 +59,10 @@ export class MoonshotApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts index 4f9dcd4d048..d86be718bce 100644 --- a/app/client/platforms/openai.ts +++ b/app/client/platforms/openai.ts @@ -33,6 +33,7 @@ import { LLMModel, LLMUsage, MultimodalContent, + SpeechOptions, } from "../api"; import Locale from "../../locales"; import { getClientConfig } from "@/app/config/client"; @@ -141,6 +142,44 @@ export class ChatGPTApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? res; } + async speech(options: SpeechOptions): Promise { + const requestPayload = { + model: options.model, + input: options.input, + voice: options.voice, + response_format: options.response_format, + speed: options.speed, + }; + + console.log("[Request] openai speech payload: ", requestPayload); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const speechPath = this.path(OpenaiPath.SpeechPath); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(speechPath, speechPayload); + clearTimeout(requestTimeoutId); + return await res.arrayBuffer(); + } catch (e) { + console.log("[Request] failed to make a speech request", e); + throw e; + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 579008a9b9d..3e8f1a45957 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -8,6 +8,7 @@ import { LLMApi, LLMModel, MultimodalContent, + SpeechOptions, } from "../api"; import Locale from "../../locales"; import { @@ -89,6 +90,10 @@ export class HunyuanApi implements LLMApi { return res.Choices?.at(0)?.Message?.Content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v, index) => ({ diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 125f265eaad..195afb72f25 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -15,6 +15,8 @@ import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; import ReturnIcon from "../icons/return.svg"; import CopyIcon from "../icons/copy.svg"; +import SpeakIcon from "../icons/speak.svg"; +import SpeakStopIcon from "../icons/speak-stop.svg"; import LoadingIcon from "../icons/three-dots.svg"; import LoadingButtonIcon from "../icons/loading.svg"; import PromptIcon from "../icons/prompt.svg"; @@ -96,6 +98,8 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, + DEFAULT_TTS_ENGINE, + ModelProvider, Path, REQUEST_TIMEOUT_MS, UNFINISHED_INPUT, @@ -112,6 +116,11 @@ import { useAllModels } from "../utils/hooks"; import { MultimodalContent } from "../client/api"; const localStorage = safeLocalStorage(); +import { ClientApi } from "../client/api"; +import { createTTSPlayer } from "../utils/audio"; +import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; + +const ttsPlayer = createTTSPlayer(); const Markdown = dynamic(async () => (await import("./markdown")).Markdown, { loading: () => , @@ -442,6 +451,7 @@ export function ChatActions(props: { hitBottom: boolean; uploading: boolean; setShowShortcutKeyModal: React.Dispatch>; + setUserInput: (input: string) => void; }) { const config = useAppConfig(); const navigate = useNavigate(); @@ -1183,10 +1193,55 @@ function _Chat() { }); }; + const accessStore = useAccessStore(); + const [speechStatus, setSpeechStatus] = useState(false); + const [speechLoading, setSpeechLoading] = useState(false); + async function openaiSpeech(text: string) { + if (speechStatus) { + ttsPlayer.stop(); + setSpeechStatus(false); + } else { + var api: ClientApi; + api = new ClientApi(ModelProvider.GPT); + const config = useAppConfig.getState(); + setSpeechLoading(true); + ttsPlayer.init(); + let audioBuffer: ArrayBuffer; + const { markdownToTxt } = require("markdown-to-txt"); + const textContent = markdownToTxt(text); + if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) { + const edgeVoiceName = accessStore.edgeVoiceName(); + const tts = new MsEdgeTTS(); + await tts.setMetadata( + edgeVoiceName, + OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3, + ); + audioBuffer = await tts.toArrayBuffer(textContent); + } else { + audioBuffer = await api.llm.speech({ + model: config.ttsConfig.model, + input: textContent, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }); + } + setSpeechStatus(true); + ttsPlayer + .play(audioBuffer, () => { + setSpeechStatus(false); + }) + .catch((e) => { + console.error("[OpenAI Speech]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + }) + .finally(() => setSpeechLoading(false)); + } + } + const context: RenderMessage[] = useMemo(() => { return session.mask.hideContext ? [] : session.mask.context.slice(); }, [session.mask.context, session.mask.hideContext]); - const accessStore = useAccessStore(); if ( context.length === 0 && @@ -1723,6 +1778,25 @@ function _Chat() { ) } /> + {config.ttsConfig.enable && ( + + ) : ( + + ) + } + onClick={() => + openaiSpeech(getMessageTextContent(message)) + } + /> + )} )} @@ -1841,6 +1915,7 @@ function _Chat() { onSearch(""); }} setShowShortcutKeyModal={setShowShortcutKeyModal} + setUserInput={setUserInput} />