From bfc38e7e08032328f0db41967c10f1548dcda7e1 Mon Sep 17 00:00:00 2001
From: root <root@localhost.localdomain>
Date: Fri, 10 Jan 2025 20:24:39 +0800
Subject: [PATCH] 20250110 tts

---
 packages/plugin-tts/.npmignore        |   7 +
 packages/plugin-tts/README.md         | 173 +++++++++++++++
 packages/plugin-tts/eslint.config.mjs |   3 +
 packages/plugin-tts/package.json      |  34 +++
 packages/plugin-tts/src/constants.ts  | 301 ++++++++++++++++++++++++++
 packages/plugin-tts/src/index.ts      | 198 +++++++++++++++++
 packages/plugin-tts/tsconfig.json     |  11 +
 packages/plugin-tts/tsup.config.ts    |  22 ++
 8 files changed, 749 insertions(+)
 create mode 100644 packages/plugin-tts/.npmignore
 create mode 100644 packages/plugin-tts/README.md
 create mode 100644 packages/plugin-tts/eslint.config.mjs
 create mode 100644 packages/plugin-tts/package.json
 create mode 100644 packages/plugin-tts/src/constants.ts
 create mode 100644 packages/plugin-tts/src/index.ts
 create mode 100644 packages/plugin-tts/tsconfig.json
 create mode 100644 packages/plugin-tts/tsup.config.ts
diff --git a/packages/plugin-tts/.npmignore b/packages/plugin-tts/.npmignore
new file mode 100644
index 00000000000..a9227d220f6
--- /dev/null
+++ b/packages/plugin-tts/.npmignore
@@ -0,0 +1,7 @@
+*
+
+!dist/**
+!package.json
+!readme.md
+!tsup.config.ts
+!tsconfig.json
\ No newline at end of file
diff --git a/packages/plugin-tts/README.md b/packages/plugin-tts/README.md
new file mode 100644
index 00000000000..52e4bb5026f
--- /dev/null
+++ b/packages/plugin-tts/README.md
@@ -0,0 +1,173 @@
+# @elizaos/plugin-tts
+
+A plugin for text-to-speech (TTS) generation using the FAL.ai API within the ElizaOS ecosystem.
+
+## Description
+
+The text-to-speech (TTS) plugin enables AI-powered speech generation through FAL.ai's services. It generates audio from the input text, automatically detects the text's language, and selects a matching voice.
+
+## Installation
+
+```bash
+pnpm install @elizaos/plugin-tts
+```
+
+## Configuration
+
+The plugin requires the following environment variable or runtime setting to be set:
+
+```bash
+FAL_API_KEY=<Your FAL.ai API key>
+```
+
+## Usage
+
+### Basic Integration
+
+```typescript
+import { TTSGenerationPlugin } from "@elizaos/plugin-tts";
+```
+
+### Voice Generation Examples
+
+```typescript
+// The plugin responds to natural language commands like:
+
+"Generate TTS of Hello World";
+"Create a TTS for Welcome to ElizaOS";
+"Make a TTS saying [your text]";
+```
+
+## API Reference
+
+### Actions
+
+#### GENERATE_TTS
+
+Generates speech audio based on text input.
+
+**Aliases:**
+- TTS_GENERATION
+- CREATE_TTS
+- TEXT2SPEECH
+- T2S
+- TEXT_TO_SPEECH
+- AUDIO_CREATE
+
+**Features:**
+- Automatic language detection
+- Voice selection based on detected language
+- Local file caching
+- Progress tracking
+- Error handling
+
+## Common Issues & Troubleshooting
+
+1. **Generation Failures**
+    - Verify FAL API key is correctly set
+    - Ensure text input is at least 3 characters long
+    - Check network connectivity to FAL.ai services
+
+2. **Storage Issues**
+    - Verify write permissions to content_cache directory
+    - Ensure sufficient disk space
+    - Check if content_cache directory exists
+
+## Security Best Practices
+
+1. **API Key Management**
+    - Store FAL API key securely using runtime settings or environment variables
+    - Never commit API keys to version control
+    - Monitor API usage
+
+## Development Guide
+
+### Setting Up Development Environment
+
+1. Clone the repository
+2. Install dependencies:
+
+```bash
+pnpm install
+```
+
+3. Build the plugin:
+
+```bash
+pnpm run build
+```
+
+4. Rebuild automatically on changes during development:
+
+```bash
+pnpm run dev
+```
+
+## Future Enhancements
+
+1. **Advanced Voice Features**
+    - Custom voice model support
+    - Voice style transfer
+    - Emotion control
+    - Speech rate adjustment
+    - Pitch modification
+    - Multiple speaker support
+
+2. **Audio Processing**
+    - Background noise reduction
+    - Audio quality enhancement
+    - Format conversion options
+    - Volume normalization
+    - Audio effects processing
+    - Batch processing support
+
+3. **Language Support**
+    - Expanded language detection
+    - Regional accent support
+    - Dialect customization
+    - Pronunciation improvements
+    - Multi-language mixing
+    - Custom pronunciation rules
+
+4. **Integration Features**
+    - Streaming audio support
+    - Real-time generation
+    - Caching optimization
+    - Batch generation
+    - Queue management
+    - Progress monitoring
+
+5. **Developer Tools**
+    - Extended API options
+    - Testing framework
+    - Performance profiling
+    - Debug logging
+    - Integration examples
+    - Documentation generator
+
+We welcome community feedback and contributions to help prioritize these enhancements.
+
+## Contributing
+
+Contributions are welcome! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for more information.
+
+## Credits
+
+This plugin integrates with and builds upon several key technologies:
+
+- [FAL.ai](https://fal.ai/): AI model deployment platform
+- [langdetect](https://www.npmjs.com/package/langdetect): Language detection library
+- [ElizaOS](https://elizaos.com): Core framework
+
+Special thanks to:
+- The FAL.ai team for AI infrastructure
+- The langdetect development community
+- The Eliza community for their contributions and feedback
+
+For more information about TTS capabilities:
+- [FAL.ai Documentation](https://fal.ai/docs)
+- [ElizaOS Documentation](https://docs.elizaos.com)
+
+## License
+
+This plugin is part of the Eliza project. See the main project repository for license information.
\ No newline at end of file
diff --git a/packages/plugin-tts/eslint.config.mjs b/packages/plugin-tts/eslint.config.mjs
new file mode 100644
index 00000000000..92fe5bbebef
--- /dev/null
+++ b/packages/plugin-tts/eslint.config.mjs
@@ -0,0 +1,3 @@
+import eslintGlobalConfig from "../../eslint.config.mjs";
+
+export default [...eslintGlobalConfig]; // re-export the repository-level ESLint config entries unchanged
diff --git a/packages/plugin-tts/package.json b/packages/plugin-tts/package.json
new file mode 100644
index 00000000000..9b339bd391d
--- /dev/null
+++ b/packages/plugin-tts/package.json
@@ -0,0 +1,34 @@
+{
+    "name": "@elizaos/plugin-tts",
+    "version": "0.1.7",
+    "type": "module",
+    "main": "dist/index.js",
+    "module": "dist/index.js",
+    "types": "dist/index.d.ts",
+    "exports": {
+        "./package.json": "./package.json",
+        ".": {
+            "import": {
+                "@elizaos/source": "./src/index.ts",
+                "types": "./dist/index.d.ts",
+                "default": "./dist/index.js"
+            }
+        }
+    },
+    "files": [
+        "dist"
+    ],
+    "dependencies": {
+        "@elizaos/core": "workspace:*",
+        "tsup": "8.3.5",
+        "whatwg-url": "7.1.0"
+    },
+    "scripts": {
+        "build": "tsup --format esm --dts",
+        "dev": "tsup --format esm --dts --watch",
+        "lint": "eslint --fix  --cache ."
+    },
+    "peerDependencies": {
+        "whatwg-url": "7.1.0"
+    }
+}
diff --git a/packages/plugin-tts/src/constants.ts b/packages/plugin-tts/src/constants.ts
new file mode 100644
index 00000000000..228b2bcac60
--- /dev/null
+++ b/packages/plugin-tts/src/constants.ts
@@ -0,0 +1,301 @@
+export const FAL_CONSTANTS = { // Identifiers used when calling FAL.ai
+    API_TTS_ENDPOINT: "fal-ai/playai/tts/v3", // Endpoint id passed to fal.subscribe
+    API_KEY_SETTING: "FAL_API_KEY", // The setting name to fetch from runtime
+};
+
+export interface VoiceOption { // One selectable voice for the TTS endpoint
+    name: string; // Speaker name, e.g. "Jennifer"
+    style: "Conversational" | "Narrative" | "Advertising" | "Meditation"; // Delivery style of the voice
+    region?: string; // Accent/region label, e.g. "US/American"
+    fullName: string; // Voice identifier passed as `voice` in the TTS request
+  }
+
+/**
+ * Voices available per language, keyed by the language code that the action
+ * handler obtains from langdetect (e.g. "en", "ja"). `fullName` is the exact
+ * voice identifier sent to the TTS endpoint.
+ */
+export const VOICE_MAP: Record<string, VoiceOption[]> = {
+    "en": [
+        {
+            name: "Jennifer",
+            style: "Conversational",
+            region: "US/American",
+            fullName: "Jennifer (English (US)/American)",
+        },
+        {
+            name: "Dexter",
+            style: "Conversational",
+            region: "US/American",
+            fullName: "Dexter (English (US)/American)",
+        },
+        {
+            name: "Ava",
+            style: "Conversational",
+            region: "AU/Australian",
+            fullName: "Ava (English (AU)/Australian)",
+        },
+        {
+            name: "Tilly",
+            style: "Conversational",
+            region: "AU/Australian",
+            fullName: "Tilly (English (AU)/Australian)",
+        },
+        {
+            name: "Charlotte",
+            style: "Advertising",
+            region: "CA/Canadian",
+            fullName: "Charlotte (Advertising) (English (CA)/Canadian)",
+        },
+        {
+            name: "Charlotte",
+            style: "Meditation",
+            region: "CA/Canadian",
+            fullName: "Charlotte (Meditation) (English (CA)/Canadian)",
+        },
+        {
+            name: "Cecil",
+            style: "Conversational",
+            region: "GB/British",
+            fullName: "Cecil (English (GB)/British)",
+        },
+        {
+            name: "Sterling",
+            style: "Conversational",
+            region: "GB/British",
+            fullName: "Sterling (English (GB)/British)",
+        },
+        {
+            name: "Cillian",
+            style: "Conversational",
+            region: "IE/Irish",
+            fullName: "Cillian (English (IE)/Irish)",
+        },
+        {
+            name: "Madison",
+            style: "Conversational",
+            region: "IE/Irish",
+            fullName: "Madison (English (IE)/Irish)",
+        },
+        {
+            name: "Ada",
+            style: "Conversational",
+            region: "ZA/South african",
+            fullName: "Ada (English (ZA)/South african)",
+        },
+        {
+            name: "Sumita",
+            style: "Conversational",
+            region: "IN/Indian",
+            fullName: "Sumita (English (IN)/Indian)",
+        },
+        {
+            name: "Navya",
+            style: "Conversational",
+            region: "IN/Indian",
+            fullName: "Navya (English (IN)/Indian)",
+        },
+    ],
+    "ja": [
+        {
+            name: "Kiriko",
+            style: "Conversational",
+            region: "Japanese",
+            fullName: "Kiriko Conversational (Japanese/Japanese)",
+        },
+        {
+            name: "Kiriko",
+            style: "Narrative",
+            region: "Japanese",
+            fullName: "Kiriko Narrative (Japanese/Japanese)",
+        },
+    ],
+    "af": [
+        {
+            name: "Ronel",
+            style: "Conversational",
+            region: "South african",
+            fullName: "Ronel Conversational (Afrikaans/South african)",
+        },
+        {
+            name: "Ronel",
+            style: "Narrative",
+            region: "South african",
+            fullName: "Ronel Narrative (Afrikaans/South african)",
+        },
+    ],
+    "ar": [
+        {
+            name: "Abdo",
+            style: "Conversational",
+            region: "Arabic",
+            fullName: "Abdo Conversational (Arabic/Arabic)",
+        },
+        {
+            name: "Abdo",
+            style: "Narrative",
+            region: "Arabic",
+            fullName: "Abdo Narrative (Arabic/Arabic)",
+        },
+    ],
+    "bn": [
+        {
+            name: "Mousmi",
+            style: "Conversational",
+            region: "Bengali",
+            fullName: "Mousmi Conversational (Bengali/Bengali)",
+        },
+        {
+            name: "Mousmi",
+            style: "Narrative",
+            region: "Bengali",
+            fullName: "Mousmi Narrative (Bengali/Bengali)",
+        },
+    ],
+    "pt": [
+        {
+            name: "Caroline",
+            style: "Conversational",
+            region: "Brazilian",
+            fullName: "Caroline Conversational (Portuguese (BR)/Brazilian)",
+        },
+        {
+            name: "Caroline",
+            style: "Narrative",
+            region: "Brazilian",
+            fullName: "Caroline Narrative (Portuguese (BR)/Brazilian)",
+        },
+    ],
+    "fr": [
+        {
+            name: "Ange",
+            style: "Conversational",
+            region: "French",
+            fullName: "Ange Conversational (French/French)",
+        },
+        {
+            name: "Ange",
+            style: "Narrative",
+            region: "French",
+            fullName: "Ange Narrative (French/French)",
+        },
+        {
+            name: "Baptiste",
+            style: "Conversational",
+            region: "French",
+            fullName: "Baptiste (English (FR)/French)", // NOTE(review): labelled English (FR) yet listed under "fr" — confirm intended
+        },
+    ],
+    "de": [
+        {
+            name: "Anke",
+            style: "Conversational",
+            region: "German",
+            fullName: "Anke Conversational (German/German)",
+        },
+        {
+            name: "Anke",
+            style: "Narrative",
+            region: "German",
+            fullName: "Anke Narrative (German/German)",
+        },
+    ],
+    "es": [
+        {
+            name: "Carmen",
+            style: "Conversational",
+            region: "Spanish",
+            fullName: "Carmen Conversational (Spanish/Spanish)",
+        },
+        {
+            name: "Patricia",
+            style: "Conversational",
+            region: "Spanish",
+            fullName: "Patricia Conversational (Spanish/Spanish)",
+        },
+    ],
+    "ko": [
+        {
+            name: "Dohee",
+            style: "Conversational",
+            region: "Korean",
+            fullName: "Dohee Conversational (Korean/Korean)",
+        },
+        {
+            name: "Dohee",
+            style: "Narrative",
+            region: "Korean",
+            fullName: "Dohee Narrative (Korean/Korean)",
+        },
+    ],
+    "he": [
+        {
+            name: "Mary",
+            style: "Conversational",
+            region: "Israeli",
+            fullName: "Mary Conversational (Hebrew/Israeli)",
+        },
+        {
+            name: "Mary",
+            style: "Narrative",
+            region: "Israeli",
+            fullName: "Mary Narrative (Hebrew/Israeli)",
+        },
+    ],
+    "ru": [
+        {
+            name: "Andrei",
+            style: "Conversational",
+            region: "Russian",
+            fullName: "Andrei Conversational (Russian/Russian)",
+        },
+        {
+            name: "Andrei",
+            style: "Narrative",
+            region: "Russian",
+            fullName: "Andrei Narrative (Russian/Russian)",
+        },
+    ],
+    // "hi" is the ISO 639-1 code for Hindi; "ne" is kept so lookups that used the original key still resolve.
+    "hi": [
+        { name: "Anuj", style: "Conversational", region: "Indian", fullName: "Anuj Conversational (Hindi/Indian)" },
+        { name: "Anuj", style: "Narrative", region: "Indian", fullName: "Anuj Narrative (Hindi/Indian)" },
+    ],
+    "ne": [
+        { name: "Anuj", style: "Conversational", region: "Indian", fullName: "Anuj Conversational (Hindi/Indian)" },
+        { name: "Anuj", style: "Narrative", region: "Indian", fullName: "Anuj Narrative (Hindi/Indian)" },
+    ],
+    "th": [
+        {
+            name: "Katbundit",
+            style: "Conversational",
+            region: "Thai",
+            fullName: "Katbundit Conversational (Thai/Thai)",
+        },
+        {
+            name: "Katbundit",
+            style: "Narrative",
+            region: "Thai",
+            fullName: "Katbundit Narrative (Thai/Thai)",
+        },
+    ],
+    "tr": [
+        {
+            name: "Ali",
+            style: "Conversational",
+            region: "Turkish",
+            fullName: "Ali Conversational (Turkish/Turkish)",
+        },
+        {
+            name: "Ali",
+            style: "Narrative",
+            region: "Turkish",
+            fullName: "Ali Narrative (Turkish/Turkish)",
+        },
+    ],
+};
+
+/** Returns one element of `voiceOptions`, chosen at a uniformly random index. */
+export const getRandomVoice = (voiceOptions: VoiceOption[]): VoiceOption =>
+    // Math.random() is in [0, 1), so for a non-empty array the floored product is a valid index.
+    voiceOptions[Math.floor(Math.random() * voiceOptions.length)];
diff --git a/packages/plugin-tts/src/index.ts b/packages/plugin-tts/src/index.ts
new file mode 100644
index 00000000000..0206ae8fb34
--- /dev/null
+++ b/packages/plugin-tts/src/index.ts
@@ -0,0 +1,198 @@
+import { elizaLogger } from "@elizaos/core";
+import {
+    Action,
+    HandlerCallback,
+    IAgentRuntime,
+    Memory,
+    Plugin,
+    State,
+} from "@elizaos/core";
+import { fal } from "@fal-ai/client";
+import { FAL_CONSTANTS, VOICE_MAP, getRandomVoice } from "./constants";
+
+import * as fs from "fs";
+import { Buffer } from "buffer";
+import * as path from "path";
+import * as process from "process";
+import { detect } from 'langdetect'; 
+
+/**
+ * Requests speech synthesis for `prompt` from the FAL.ai PlayAI TTS endpoint.
+ *
+ * @param prompt - Text to be spoken.
+ * @param voice - Voice identifier sent as the `voice` input (a `VoiceOption.fullName`).
+ * @param runtime - Agent runtime used to look up the FAL API key setting.
+ * @returns `{ success: true, data }` with the endpoint's response payload, or
+ *          `{ success: false, error }` with a message when the request throws.
+ */
+const generateTTS = async (prompt: string, voice: string, runtime: IAgentRuntime) => {
+    // The key VALUE must come from the runtime; API_KEY_SETTING is only the setting's name.
+    const falApiKey = runtime.getSetting(FAL_CONSTANTS.API_KEY_SETTING);
+    if (falApiKey) process.env["FAL_KEY"] = falApiKey;
+
+    try {
+        elizaLogger.log("Starting TTS generation with prompt:", prompt);
+
+        const response = await fal.subscribe(FAL_CONSTANTS.API_TTS_ENDPOINT, {
+            input: { input: prompt, voice: voice },
+            logs: true,
+            onQueueUpdate: (update) => {
+                if (update.status === "IN_PROGRESS") {
+                    // Wrap the logger so forEach's extra (index, array) arguments are not logged.
+                    update.logs.forEach((log) => elizaLogger.log(log.message));
+                }
+            },
+        });
+
+        elizaLogger.log("Generation request successful, received response:", response);
+        return { success: true, data: response.data };
+    } catch (error) {
+        elizaLogger.error("TTS generation error:", error);
+        return {
+            success: false,
+            error: (error instanceof Error && error.message) || "Unknown error occurred",
+        };
+    }
+};
+
+/**
+ * GENERATE_TTS action: strips mentions/command words from the incoming message,
+ * synthesises the remaining text with `generateTTS`, saves the audio under
+ * `content_cache/`, and reports progress and the result through `callback`.
+ */
+const TTSGeneration: Action = {
+    name: "GENERATE_TTS",
+    similes: [
+        "TTS_GENERATION",
+        "CREATE_TTS",
+        "TEXT2SPEECH",
+        "T2S",
+        "TEXT_TO_SPEECH",
+        "AUDIO_CREATE",
+    ],
+    description: "Generate a tts audio based on a text prompt",
+    // Valid only when a FAL API key is configured in the runtime settings.
+    validate: async (runtime: IAgentRuntime, _message: Memory) => {
+        elizaLogger.log("Validating TTS action");
+        const FalApiKey = runtime.getSetting("FAL_API_KEY");
+        elizaLogger.log("FAL_API_KEY present:", !!FalApiKey);
+        return !!FalApiKey;
+    },
+    handler: async (
+        runtime: IAgentRuntime,
+        message: Memory,
+        _state: State,
+        _options: any,
+        callback: HandlerCallback
+    ) => {
+        elizaLogger.log("TTS request:", message);
+
+        // Clean up the prompt by removing mentions and commands
+        const TTSPrompt = message.content.text
+            .replace(/<@\d+>/g, "") // Remove mentions
+            .replace(/generate TTS|create TTS|make TTS|render TTS/gi, "") // Remove commands
+            .trim();
+
+        if (!TTSPrompt || TTSPrompt.length < 3) {
+            callback({ text: "Please input a word at least of length 3" });
+            return;
+        }
+
+        elizaLogger.log("TTS prompt:", TTSPrompt);
+
+        callback({
+            text: `I'll generate a audio based on your prompt: "${TTSPrompt}". This might take a few seconds...`,
+        });
+
+        try {
+            // Detection may return nothing (short/ambiguous text) or a language that has
+            // no entry in VOICE_MAP; use the English voices instead of crashing.
+            const detected = detect(TTSPrompt);
+            const languageCode = detected?.[0]?.lang ?? "en";
+            const voice_subject = VOICE_MAP[languageCode] ?? VOICE_MAP["en"];
+            const target_voice = getRandomVoice(voice_subject).fullName;
+
+            elizaLogger.log("Starting TTS generation with prompt:", TTSPrompt, "and voice:", target_voice);
+            const result = await generateTTS(TTSPrompt, target_voice, runtime);
+
+            if (result.success && result.data.audio.url) {
+                // Download the Audio file
+                const response = await fetch(result.data.audio.url);
+                if (!response.ok) {
+                    throw new Error(`Audio download failed with HTTP status ${response.status}`);
+                }
+                const arrayBuffer = await response.arrayBuffer();
+                // basename() keeps a remotely supplied file name from escaping content_cache/
+                const TTSFileName = `content_cache/tts_${path.basename(String(result.data.audio.file_name))}`;
+
+                // Ensure the target directory exists (a no-op when it already does)
+                fs.mkdirSync(path.dirname(TTSFileName), { recursive: true });
+
+                // Save Audio file
+                fs.writeFileSync(TTSFileName, Buffer.from(arrayBuffer));
+
+                elizaLogger.log("Audio Duration:", result.data.audio.duration);
+                callback(
+                    {
+                        text: "TTS Success! Here's your generated audio!",
+                        attachments: [
+                            {
+                                id: crypto.randomUUID(),
+                                url: result.data.audio.url,
+                                title: "TTS Generation",
+                                source: "TTSGeneration",
+                                description: TTSPrompt,
+                                text: TTSPrompt,
+                            },
+                        ],
+                    },
+                    [TTSFileName]
+                ); // Add the audio file to the attachments
+            } else {
+                callback({
+                    text: `TTS generation failed: ${result.error ?? "no audio URL returned"}`,
+                    error: true,
+                });
+            }
+        } catch (error) {
+            elizaLogger.error(`Failed to generate TTS. Error: ${error}`);
+            callback({
+                text: `TTS generation failed: ${error instanceof Error ? error.message : String(error)}`,
+                error: true,
+            });
+        }
+    },
+    examples: [
+        [
+            {
+                user: "{{user1}}",
+                content: { text: "Generate a TTS of prompt: Hello world!" },
+            },
+            {
+                user: "{{agentName}}",
+                content: {
+                    text: "I'll call a TTS to generate an audio based on your input prompt",
+                    action: "CREATE_TTS",
+                },
+            },
+        ],
+        [
+            {
+                user: "{{user1}}",
+                content: { text: "Please do TTS to a prompt: Sam is busy now" },
+            },
+            {
+                user: "{{agentName}}",
+                content: { text: "Ok, please wait for the tts generation~", action: "AUDIO_CREATE" },
+            },
+        ],
+    ],
+} as Action;
+
+export const TTSGenerationPlugin: Plugin = { // Plugin object exported by this package
+    name: "TTSGeneration",
+    description: "Generate TTS using PlayAI tts (v3)",
+    actions: [TTSGeneration], // the GENERATE_TTS action defined in this file
+    evaluators: [], // none registered
+    providers: [], // none registered
+};
diff --git a/packages/plugin-tts/tsconfig.json b/packages/plugin-tts/tsconfig.json
new file mode 100644
index 00000000000..d5b54aefd5f
--- /dev/null
+++ b/packages/plugin-tts/tsconfig.json
@@ -0,0 +1,11 @@
+{
+    "extends": "../core/tsconfig.json",
+    "compilerOptions": {
+        "outDir": "dist",
+        "rootDir": "src",
+        "module": "ESNext",
+        "moduleResolution": "Bundler",
+        "types": ["node"]
+    },
+    "include": ["src/**/*.ts"]
+}
diff --git a/packages/plugin-tts/tsup.config.ts b/packages/plugin-tts/tsup.config.ts
new file mode 100644
index 00000000000..7269091246e
--- /dev/null
+++ b/packages/plugin-tts/tsup.config.ts
@@ -0,0 +1,22 @@
+import { defineConfig } from "tsup";
+
+export default defineConfig({
+    entry: ["src/index.ts"],
+    outDir: "dist",
+    sourcemap: true,
+    clean: true,
+    format: ["esm"], // ESM output only, matching "type": "module" in package.json
+    external: [ // modules listed here are left as imports rather than bundled into dist
+        "dotenv",
+        "fs",
+        "path",
+        "process",
+        "@reflink/reflink", // NOTE(review): not imported by src/ in this patch — presumably copied from another plugin; confirm
+        "@node-llama-cpp", // NOTE(review): same as above — confirm it is needed here
+        "@fal-ai/client",
+        "langdetect",
+        "https",
+        "http",
+        "agentkeepalive", // NOTE(review): not imported by src/ in this patch — confirm
+    ],
+});