From bfc38e7e08032328f0db41967c10f1548dcda7e1 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 10 Jan 2025 20:24:39 +0800 Subject: [PATCH] 20250110 tts --- packages/plugin-tts/.npmignore | 7 + packages/plugin-tts/README.md | 173 +++++++++++++++ packages/plugin-tts/eslint.config.mjs | 3 + packages/plugin-tts/package.json | 34 +++ packages/plugin-tts/src/constants.ts | 301 ++++++++++++++++++++++++++ packages/plugin-tts/src/index.ts | 198 +++++++++++++++++ packages/plugin-tts/tsconfig.json | 11 + packages/plugin-tts/tsup.config.ts | 22 ++ 8 files changed, 749 insertions(+) create mode 100644 packages/plugin-tts/.npmignore create mode 100644 packages/plugin-tts/README.md create mode 100644 packages/plugin-tts/eslint.config.mjs create mode 100644 packages/plugin-tts/package.json create mode 100644 packages/plugin-tts/src/constants.ts create mode 100644 packages/plugin-tts/src/index.ts create mode 100644 packages/plugin-tts/tsconfig.json create mode 100644 packages/plugin-tts/tsup.config.ts diff --git a/packages/plugin-tts/.npmignore b/packages/plugin-tts/.npmignore new file mode 100644 index 00000000000..a9227d220f6 --- /dev/null +++ b/packages/plugin-tts/.npmignore @@ -0,0 +1,7 @@ +* + +!dist/** +!package.json +!readme.md +!tsup.config.ts +!tsconfig.json \ No newline at end of file diff --git a/packages/plugin-tts/README.md b/packages/plugin-tts/README.md new file mode 100644 index 00000000000..52e4bb5026f --- /dev/null +++ b/packages/plugin-tts/README.md @@ -0,0 +1,173 @@ +# @elizaos/plugin-tts + +A plugin for text-to-speech(TTS) generation using the FAL.ai API within the ElizaOS ecosystem. + +## Description + +The text-to-speech(TTS) plugin enables AI-powered creation of speech through FAL.ai's services. It provides functionality to generate audio from text descriptions, automatically detects language, and selects appropriate voice models. 
+ +## Installation + +```bash +pnpm install @elizaos/plugin-tts +``` + +## Configuration + +The plugin requires the following environment variable or runtime setting to be set: + +```typescript +FAL_API_KEY= +``` + +## Usage + +### Basic Integration + +```typescript +import { TTSGenerationPlugin } from "@elizaos/plugin-tts"; +``` + +### Voice Generation Examples + +```typescript +// The plugin responds to natural language commands like: + +"Generate TTS of Hello World"; +"Create a TTS for Welcome to ElizaOS"; +"Make a TTS saying [your text]"; +``` + +## API Reference + +### Actions + +#### GENERATE_TTS + +Generates speech audio based on text input. + +**Aliases:** +- TTS_GENERATION +- CREATE_TTS +- TEXT2SPEECH +- T2S +- TEXT_TO_SPEECH +- AUDIO_CREATE + +**Features:** +- Automatic language detection +- Voice selection based on detected language +- Local file caching +- Progress tracking +- Error handling + +## Common Issues & Troubleshooting + +1. **Generation Failures** + - Verify FAL API key is correctly set + - Ensure text input is at least 3 characters long + - Check network connectivity to FAL.ai services + +2. **Storage Issues** + - Verify write permissions to content_cache directory + - Ensure sufficient disk space + - Check if content_cache directory exists + +## Security Best Practices + +1. **API Key Management** + - Store FAL API key securely using runtime settings or environment variables + - Never commit API keys to version control + - Monitor API usage + +## Development Guide + +### Setting Up Development Environment + +1. Clone the repository +2. Install dependencies: + +```bash +pnpm install +``` + +3. Build the plugin: + +```bash +pnpm run build +``` + +4. Run the plugin: + +```bash +pnpm run dev +``` + +## Future Enhancements + +1. **Advanced Voice Features** + - Custom voice model support + - Voice style transfer + - Emotion control + - Speech rate adjustment + - Pitch modification + - Multiple speaker support + +2. 
**Audio Processing** + - Background noise reduction + - Audio quality enhancement + - Format conversion options + - Volume normalization + - Audio effects processing + - Batch processing support + +3. **Language Support** + - Expanded language detection + - Regional accent support + - Dialect customization + - Pronunciation improvements + - Multi-language mixing + - Custom pronunciation rules + +4. **Integration Features** + - Streaming audio support + - Real-time generation + - Caching optimization + - Batch generation + - Queue management + - Progress monitoring + +5. **Developer Tools** + - Extended API options + - Testing framework + - Performance profiling + - Debug logging + - Integration examples + - Documentation generator + +We welcome community feedback and contributions to help prioritize these enhancements. + +## Contributing + +Contributions are welcome! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for more information. + +## Credits + +This plugin integrates with and builds upon several key technologies: + +- [FAL.ai](https://fal.ai/): AI model deployment platform +- [langdetect](https://www.npmjs.com/package/langdetect): Language detection library +- [ElizaOS](https://elizaos.com): Core framework + +Special thanks to: +- The FAL.ai team for AI infrastructure +- The langdetect development community +- The Eliza community for their contributions and feedback + +For more information about TTS capabilities: +- [FAL.ai Documentation](https://fal.ai/docs) +- [ElizaOS Documentation](https://docs.elizaos.com) + +## License + +This plugin is part of the Eliza project. See the main project repository for license information. 
\ No newline at end of file diff --git a/packages/plugin-tts/eslint.config.mjs b/packages/plugin-tts/eslint.config.mjs new file mode 100644 index 00000000000..92fe5bbebef --- /dev/null +++ b/packages/plugin-tts/eslint.config.mjs @@ -0,0 +1,3 @@ +import eslintGlobalConfig from "../../eslint.config.mjs"; + +export default [...eslintGlobalConfig]; diff --git a/packages/plugin-tts/package.json b/packages/plugin-tts/package.json new file mode 100644 index 00000000000..9b339bd391d --- /dev/null +++ b/packages/plugin-tts/package.json @@ -0,0 +1,34 @@ +{ + "name": "@elizaos/plugin-tts", + "version": "0.1.7", + "type": "module", + "main": "dist/index.js", + "module": "dist/index.js", + "types": "dist/index.d.ts", + "exports": { + "./package.json": "./package.json", + ".": { + "import": { + "@elizaos/source": "./src/index.ts", + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + } + } + }, + "files": [ + "dist" + ], + "dependencies": { + "@elizaos/core": "workspace:*", + "tsup": "8.3.5", + "whatwg-url": "7.1.0" + }, + "scripts": { + "build": "tsup --format esm --dts", + "dev": "tsup --format esm --dts --watch", + "lint": "eslint --fix --cache ." 
+ }, + "peerDependencies": { + "whatwg-url": "7.1.0" + } +} diff --git a/packages/plugin-tts/src/constants.ts b/packages/plugin-tts/src/constants.ts new file mode 100644 index 00000000000..228b2bcac60 --- /dev/null +++ b/packages/plugin-tts/src/constants.ts @@ -0,0 +1,301 @@ +export const FAL_CONSTANTS = { + API_TTS_ENDPOINT: "fal-ai/playai/tts/v3", + API_KEY_SETTING: "FAL_API_KEY", // The setting name to fetch from runtime +}; + +export interface VoiceOption { + name: string; + style: "Conversational" | "Narrative" | "Advertising" | "Meditation"; + region?: string; + fullName: string; + } + +export const VOICE_MAP: Record<string, VoiceOption[]> = { // keyed by the language code that langdetect reports + 'en': [ + { + name: "Jennifer", + style: "Conversational", + region: "US/American", + fullName: "Jennifer (English (US)/American)" + }, + { + name: "Dexter", + style: "Conversational", + region: "US/American", + fullName: "Dexter (English (US)/American)" + }, + { + name: "Ava", + style: "Conversational", + region: "AU/Australian", + fullName: "Ava (English (AU)/Australian)" + }, + { + name: "Tilly", + style: "Conversational", + region: "AU/Australian", + fullName: "Tilly (English (AU)/Australian)" + }, + { + name: "Charlotte", + style: "Advertising", + region: "CA/Canadian", + fullName: "Charlotte (Advertising) (English (CA)/Canadian)" + }, + { + name: "Charlotte", + style: "Meditation", + region: "CA/Canadian", + fullName: "Charlotte (Meditation) (English (CA)/Canadian)" + }, + { + name: "Cecil", + style: "Conversational", + region: "GB/British", + fullName: "Cecil (English (GB)/British)" + }, + { + name: "Sterling", + style: "Conversational", + region: "GB/British", + fullName: "Sterling (English (GB)/British)" + }, + { + name: "Cillian", + style: "Conversational", + region: "IE/Irish", + fullName: "Cillian (English (IE)/Irish)" + }, + { + name: "Madison", + style: "Conversational", + region: "IE/Irish", + fullName: "Madison (English (IE)/Irish)" + }, + { + name: "Ada", + style: "Conversational", + region: "ZA/South african", + fullName: 
"Ada (English (ZA)/South african)" + }, + { + name: "Sumita", + style: "Conversational", + region: "IN/Indian", + fullName: "Sumita (English (IN)/Indian)" + }, + { + name: "Navya", + style: "Conversational", + region: "IN/Indian", + fullName: "Navya (English (IN)/Indian)" + } + ], + 'ja': [ + { + name: "Kiriko", + style: "Conversational", + region: "Japanese", + fullName: "Kiriko Conversational (Japanese/Japanese)" + }, + { + name: "Kiriko", + style: "Narrative", + region: "Japanese", + fullName: "Kiriko Narrative (Japanese/Japanese)" + } + ], + 'af': [ + { + name: "Ronel", + style: "Conversational", + region: "South african", + fullName: "Ronel Conversational (Afrikaans/South african)" + }, + { + name: "Ronel", + style: "Narrative", + region: "South african", + fullName: "Ronel Narrative (Afrikaans/South african)" + } + ], + 'ar': [ + { + name: "Abdo", + style: "Conversational", + region: "Arabic", + fullName: "Abdo Conversational (Arabic/Arabic)" + }, + { + name: "Abdo", + style: "Narrative", + region: "Arabic", + fullName: "Abdo Narrative (Arabic/Arabic)" + } + ], + 'bn': [ + { + name: "Mousmi", + style: "Conversational", + region: "Bengali", + fullName: "Mousmi Conversational (Bengali/Bengali)" + }, + { + name: "Mousmi", + style: "Narrative", + region: "Bengali", + fullName: "Mousmi Narrative (Bengali/Bengali)" + } + ], + 'pt': [ + { + name: "Caroline", + style: "Conversational", + region: "Brazilian", + fullName: "Caroline Conversational (Portuguese (BR)/Brazilian)" + }, + { + name: "Caroline", + style: "Narrative", + region: "Brazilian", + fullName: "Caroline Narrative (Portuguese (BR)/Brazilian)" + } + ], + 'fr': [ + { + name: "Ange", + style: "Conversational", + region: "French", + fullName: "Ange Conversational (French/French)" + }, + { + name: "Ange", + style: "Narrative", + region: "French", + fullName: "Ange Narrative (French/French)" + }, + { + name: "Baptiste", + style: "Conversational", + region: "French", + fullName: "Baptiste (English (FR)/French)" 
+ } + ], + 'de': [ + { + name: "Anke", + style: "Conversational", + region: "German", + fullName: "Anke Conversational (German/German)" + }, + { + name: "Anke", + style: "Narrative", + region: "German", + fullName: "Anke Narrative (German/German)" + } + ], + 'es': [ + { + name: "Carmen", + style: "Conversational", + region: "Spanish", + fullName: "Carmen Conversational (Spanish/Spanish)" + }, + { + name: "Patricia", + style: "Conversational", + region: "Spanish", + fullName: "Patricia Conversational (Spanish/Spanish)" + } + ], + 'ko': [ + { + name: "Dohee", + style: "Conversational", + region: "Korean", + fullName: "Dohee Conversational (Korean/Korean)" + }, + { + name: "Dohee", + style: "Narrative", + region: "Korean", + fullName: "Dohee Narrative (Korean/Korean)" + } + ], + 'he': [ + { + name: "Mary", + style: "Conversational", + region: "Israeli", + fullName: "Mary Conversational (Hebrew/Israeli)" + }, + { + name: "Mary", + style: "Narrative", + region: "Israeli", + fullName: "Mary Narrative (Hebrew/Israeli)" + } + ], + 'ru': [ + { + name: "Andrei", + style: "Conversational", + region: "Russian", + fullName: "Andrei Conversational (Russian/Russian)" + }, + { + name: "Andrei", + style: "Narrative", + region: "Russian", + fullName: "Andrei Narrative (Russian/Russian)" + } + ], + 'ne': [ + { + name: "Anuj", + style: "Conversational", + region: "Indian", + fullName: "Anuj Conversational (Hindi/Indian)" + }, + { + name: "Anuj", + style: "Narrative", + region: "Indian", + fullName: "Anuj Narrative (Hindi/Indian)" + } + ], + 'th': [ + { + name: "Katbundit", + style: "Conversational", + region: "Thai", + fullName: "Katbundit Conversational (Thai/Thai)" + }, + { + name: "Katbundit", + style: "Narrative", + region: "Thai", + fullName: "Katbundit Narrative (Thai/Thai)" + } + ], + 'tr': [ + { + name: "Ali", + style: "Conversational", + region: "Turkish", + fullName: "Ali Conversational (Turkish/Turkish)" + }, + { + name: "Ali", + style: "Narrative", + region: "Turkish", + 
fullName: "Ali Narrative (Turkish/Turkish)" + } + ], +}; + +export const getRandomVoice = (voiceOptions: VoiceOption[]): VoiceOption => { + const randomIndex = Math.floor(Math.random() * voiceOptions.length); + return voiceOptions[randomIndex]; + }; diff --git a/packages/plugin-tts/src/index.ts b/packages/plugin-tts/src/index.ts new file mode 100644 index 00000000000..0206ae8fb34 --- /dev/null +++ b/packages/plugin-tts/src/index.ts @@ -0,0 +1,198 @@ +import { elizaLogger } from "@elizaos/core"; +import { + Action, + HandlerCallback, + IAgentRuntime, + Memory, + Plugin, + State, +} from "@elizaos/core"; +import { fal } from "@fal-ai/client"; +import { FAL_CONSTANTS, VOICE_MAP, getRandomVoice } from "./constants"; + +import * as fs from "fs"; +import { Buffer } from "buffer"; +import * as path from "path"; +import * as process from "process"; +import { detect } from 'langdetect'; + +const generateTTS = async (prompt: string, voice: string, runtime: IAgentRuntime) => { + process.env["FAL_KEY"] = + runtime.getSetting(FAL_CONSTANTS.API_KEY_SETTING) ?? ""; // pass the setting's value; API_KEY_SETTING is only its name + + try { + elizaLogger.log("Starting TTS generation with prompt:", prompt); + + const response = await fal.subscribe(FAL_CONSTANTS.API_TTS_ENDPOINT, { + input: { + input: prompt, + voice: voice + }, + logs: true, + onQueueUpdate: (update) => { + if (update.status === "IN_PROGRESS") { + update.logs + .map((log) => log.message) + .forEach((line) => elizaLogger.log(line)); // arrow keeps the logger bound and drops forEach's index/array arguments + } + }, + }); + + elizaLogger.log( + "Generation request successful, received response:", + response + ); + + return {success: true, + data: response.data}; + } + catch (error) { + elizaLogger.error("TTS generation error:", error); + return { + success: false, + error: error.message || "Unknown error occurred", + }; + } +}; + +const TTSGeneration: Action = { + name: "GENERATE_TTS", + similes: [ + "TTS_GENERATION", + "CREATE_TTS", + "TEXT2SPEECH", + "T2S", + "TEXT_TO_SPEECH", + "AUDIO_CREATE", + ], + description: "Generate a tts audio based on a text 
prompt", + validate: async (runtime: IAgentRuntime, _message: Memory) => { + elizaLogger.log("Validating TTS action"); + const FalApiKey = runtime.getSetting(FAL_CONSTANTS.API_KEY_SETTING); + elizaLogger.log("FAL_API_KEY present:", !!FalApiKey); + return !!FalApiKey; + }, + handler: async ( + runtime: IAgentRuntime, + message: Memory, + _state: State, + _options: any, + callback: HandlerCallback + ) => { + elizaLogger.log("TTS request:", message); + + // Clean up the prompt by removing mentions and commands + const TTSPrompt = message.content.text + .replace(/<@\d+>/g, "") // Remove mentions + .replace(/generate TTS|create TTS|make TTS|render TTS/gi, "") // Remove commands + .trim(); + + if (!TTSPrompt || TTSPrompt.length < 3) { + callback({ + text: "Please input a word at least of length 3", + }); + return; + } + + elizaLogger.log("TTS prompt:", TTSPrompt); + + callback({ + text: `I'll generate a audio based on your prompt: "${TTSPrompt}". This might take a few seconds...`, + }); + + const language = detect(TTSPrompt); + const voice_subject = VOICE_MAP[language?.[0]?.lang ?? "en"] ?? VOICE_MAP["en"]; // fall back to the English voices when detection yields nothing or the language has no entry + const target_voice = getRandomVoice(voice_subject).fullName; + + elizaLogger.log("Starting TTS generation with prompt:", TTSPrompt, "and voice:", target_voice); + + try { + const result = await generateTTS(TTSPrompt, target_voice, runtime); + + if (result.success && result.data.audio.url) { + // Download the Audio file + const response = await fetch(result.data.audio.url); + const arrayBuffer = await response.arrayBuffer(); + const TTSFileName = `content_cache/tts_${result.data.audio.file_name}`; + + // Ensure the target directory exists + const directoryPath = path.dirname(TTSFileName); + if (!fs.existsSync(directoryPath)) { + fs.mkdirSync(directoryPath, { recursive: true }); + } + + // Save Audio file + fs.writeFileSync(TTSFileName, Buffer.from(arrayBuffer)); + + elizaLogger.log("Audio Duration:", result.data.audio.duration); + callback( + { + text: "TTS Success! 
Here's your generated audio!", + attachments: [ + { + id: crypto.randomUUID(), + url: result.data.audio.url, + title: "TTS Generation", + source: "TTSGeneration", + description: TTSPrompt, + text: TTSPrompt, + }, + ], + }, + [TTSFileName] + ); // Add the audio file to the attachments + } else { + callback({ + text: `TTS generation failed: ${result.error}`, + error: true, + }); + } + } catch (error) { + elizaLogger.error(`Failed to generate TTS. Error: ${error}`); + callback({ + text: `TTS generation failed: ${error.message}`, + error: true, + }); + } + }, + examples: [ + [ + { + user: "{{user1}}", + content: { + text: "Generate a TTS of prompt: Hello world!", + }, + }, + { + user: "{{agentName}}", + content: { + text: "I'll call a TTS to generate an audio based on your input prompt", + action: "CREATE_TTS", + }, + }, + ], + [ + { + user: "{{user1}}", + content: { + text: "Please do TTS to a prompt: Sam is busy now", + }, + }, + { + user: "{{agentName}}", + content: { + text: "Ok, please wait for the tts generation~", + action: "AUDIO_CREATE", + }, + }, + ], + ], +} as Action; + +export const TTSGenerationPlugin: Plugin = { + name: "TTSGeneration", + description: "Generate TTS using PlayAI tts (v3)", + actions: [TTSGeneration], + evaluators: [], + providers: [], +}; diff --git a/packages/plugin-tts/tsconfig.json b/packages/plugin-tts/tsconfig.json new file mode 100644 index 00000000000..d5b54aefd5f --- /dev/null +++ b/packages/plugin-tts/tsconfig.json @@ -0,0 +1,11 @@ +{ + "extends": "../core/tsconfig.json", + "compilerOptions": { + "outDir": "dist", + "rootDir": "src", + "module": "ESNext", + "moduleResolution": "Bundler", + "types": ["node"] + }, + "include": ["src/**/*.ts"] +} diff --git a/packages/plugin-tts/tsup.config.ts b/packages/plugin-tts/tsup.config.ts new file mode 100644 index 00000000000..7269091246e --- /dev/null +++ b/packages/plugin-tts/tsup.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + entry: 
["src/index.ts"], + outDir: "dist", + sourcemap: true, + clean: true, + format: ["esm"], + external: [ + "dotenv", + "fs", + "path", + "process", + "@reflink/reflink", + "@node-llama-cpp", + "@fal-ai/client", + "langdetect", + "https", + "http", + "agentkeepalive", + ], +});