diff --git a/agent/src/index.ts b/agent/src/index.ts
index 803acfd895..a1ad2f470b 100644
--- a/agent/src/index.ts
+++ b/agent/src/index.ts
@@ -95,17 +95,10 @@ export async function loadCharacters(
         characterPaths,
         cwd: process.cwd(),
         dirname: __dirname,
-        fullPath: path.resolve(
-            process.cwd(),
-            "characters/8bitoracle.laozi.character.json"
-        ),
-        exists: fs.existsSync(
-            path.resolve(
-                process.cwd(),
-                "characters/8bitoracle.laozi.character.json"
-            )
-        ),
         dirContents: fs.readdirSync(process.cwd()),
+        characters: fs
+            .readdirSync(path.join(process.cwd(), "characters"))
+            .filter((file) => file.endsWith(".character.json")),
     });

     if (characterPaths?.length > 0) {
diff --git a/packages/adapter-postgres/src/index.ts b/packages/adapter-postgres/src/index.ts
index 9d8a59101b..bceeae1415 100644
--- a/packages/adapter-postgres/src/index.ts
+++ b/packages/adapter-postgres/src/index.ts
@@ -174,12 +174,33 @@ export class PostgresDatabaseAdapter
     async init() {
         await this.testConnection();

-        const schema = fs.readFileSync(
-            path.resolve(__dirname, "../schema.sql"),
-            "utf8"
-        );
+        const client = await this.pool.connect();
+        try {
+            await client.query("BEGIN");
+
+            // Check if schema already exists (check for a core table)
+            const { rows } = await client.query(`
+                SELECT EXISTS (
+                    SELECT FROM information_schema.tables
+                    WHERE table_name = 'rooms'
+                );
+            `);
+
+            if (!rows[0].exists) {
+                const schema = fs.readFileSync(
+                    path.resolve(__dirname, "../schema.sql"),
+                    "utf8"
+                );
+                await client.query(schema);
+            }

-        await this.query(schema);
+            await client.query("COMMIT");
+        } catch (error) {
+            await client.query("ROLLBACK");
+            throw error;
+        } finally {
+            client.release();
+        }
     }

     async close() {
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
index 1edd49de65..b900328e88 100644
--- a/packages/core/src/types.ts
+++ b/packages/core/src/types.ts
@@ -674,8 +674,17 @@ export type Character = {
     secrets?: { [key: string]: string };
     buttplug?: boolean;
     voice?: {
-        model?: string;
-        url?: string;
+        model?: string; // For VITS
+        url?: string; // Legacy VITS support
+        elevenlabs?: {
+            // New structured ElevenLabs config
+            voiceId: string;
+            model?: string;
+            stability?: string;
+            similarityBoost?: string;
+            style?: string;
+            useSpeakerBoost?: string;
+        };
     };
     model?: string;
     embeddingModel?: string;
diff --git a/packages/plugin-node/src/enviroment.ts b/packages/plugin-node/src/enviroment.ts
index 779704cb42..5d30bb93ed 100644
--- a/packages/plugin-node/src/enviroment.ts
+++ b/packages/plugin-node/src/enviroment.ts
@@ -3,27 +3,21 @@ import { z } from "zod";

 export const nodeEnvSchema = z.object({
     OPENAI_API_KEY: z.string().min(1, "OpenAI API key is required"),
+
+    // Core settings
     ELEVENLABS_XI_API_KEY: z.string().optional(),
-    ELEVENLABS_MODEL_ID: z.string().min(1, "ElevenLabs model ID is required"),
-    ELEVENLABS_VOICE_ID: z.string().min(1, "ElevenLabs voice ID is required"),
-    ELEVENLABS_VOICE_STABILITY: z
-        .string()
-        .min(1, "ElevenLabs voice stability is required"),
-    ELEVENLABS_VOICE_SIMILARITY_BOOST: z
-        .string()
-        .min(1, "ElevenLabs voice similarity boost is required"),
-    ELEVENLABS_VOICE_STYLE: z
-        .string()
-        .min(1, "ElevenLabs voice style is required"),
-    ELEVENLABS_VOICE_USE_SPEAKER_BOOST: z
-        .string()
-        .min(1, "ElevenLabs voice speaker boost setting is required"),
-    ELEVENLABS_OPTIMIZE_STREAMING_LATENCY: z
-        .string()
-        .min(1, "ElevenLabs streaming latency optimization is required"),
-    ELEVENLABS_OUTPUT_FORMAT: z
-        .string()
-        .min(1, "ElevenLabs output format is required"),
+
+    // All other settings optional with defaults
+    ELEVENLABS_MODEL_ID: z.string().optional(),
+    ELEVENLABS_VOICE_ID: z.string().optional(),
+    ELEVENLABS_VOICE_STABILITY: z.string().optional(),
+    ELEVENLABS_VOICE_SIMILARITY_BOOST: z.string().optional(),
+    ELEVENLABS_VOICE_STYLE: z.string().optional(),
+    ELEVENLABS_VOICE_USE_SPEAKER_BOOST: z.string().optional(),
+    ELEVENLABS_OPTIMIZE_STREAMING_LATENCY: z.string().optional(),
+    ELEVENLABS_OUTPUT_FORMAT: z.string().optional(),
+    VITS_VOICE: z.string().optional(),
+    VITS_MODEL: z.string().optional(),
 });

 export type NodeConfig = z.infer<typeof nodeEnvSchema>;
@@ -32,34 +26,51 @@ export async function validateNodeConfig(
     runtime: IAgentRuntime
 ): Promise<NodeConfig> {
     try {
+        const voiceSettings = runtime.character.settings?.voice;
+        const elevenlabs = voiceSettings?.elevenlabs;
+
+        // Only include what's absolutely required
         const config = {
             OPENAI_API_KEY:
                 runtime.getSetting("OPENAI_API_KEY") ||
                 process.env.OPENAI_API_KEY,
-            ELEVENLABS_MODEL_ID:
-                runtime.getSetting("ELEVENLABS_MODEL_ID") ||
-                process.env.ELEVENLABS_MODEL_ID,
-            ELEVENLABS_VOICE_ID:
-                runtime.getSetting("ELEVENLABS_VOICE_ID") ||
-                process.env.ELEVENLABS_VOICE_ID,
-            ELEVENLABS_VOICE_STABILITY:
-                runtime.getSetting("ELEVENLABS_VOICE_STABILITY") ||
-                process.env.ELEVENLABS_VOICE_STABILITY,
-            ELEVENLABS_VOICE_SIMILARITY_BOOST:
-                runtime.getSetting("ELEVENLABS_VOICE_SIMILARITY_BOOST") ||
-                process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST,
-            ELEVENLABS_VOICE_STYLE:
-                runtime.getSetting("ELEVENLABS_VOICE_STYLE") ||
-                process.env.ELEVENLABS_VOICE_STYLE,
-            ELEVENLABS_VOICE_USE_SPEAKER_BOOST:
-                runtime.getSetting("ELEVENLABS_VOICE_USE_SPEAKER_BOOST") ||
-                process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST,
-            ELEVENLABS_OPTIMIZE_STREAMING_LATENCY:
-                runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY") ||
-                process.env.ELEVENLABS_OPTIMIZE_STREAMING_LATENCY,
-            ELEVENLABS_OUTPUT_FORMAT:
-                runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT") ||
-                process.env.ELEVENLABS_OUTPUT_FORMAT,
+            ELEVENLABS_XI_API_KEY:
+                runtime.getSetting("ELEVENLABS_XI_API_KEY") ||
+                process.env.ELEVENLABS_XI_API_KEY,
+
+            // Use character card settings first, fall back to env vars, then defaults
+            ...(runtime.getSetting("ELEVENLABS_XI_API_KEY") && {
+                ELEVENLABS_MODEL_ID:
+                    elevenlabs?.model ||
+                    process.env.ELEVENLABS_MODEL_ID ||
+                    "eleven_monolingual_v1",
+                ELEVENLABS_VOICE_ID:
+                    elevenlabs?.voiceId || process.env.ELEVENLABS_VOICE_ID,
+                ELEVENLABS_VOICE_STABILITY:
+                    elevenlabs?.stability ||
+                    process.env.ELEVENLABS_VOICE_STABILITY ||
+                    "0.5",
+                ELEVENLABS_VOICE_SIMILARITY_BOOST:
+                    elevenlabs?.similarityBoost ||
+                    process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST ||
+                    "0.75",
+                ELEVENLABS_VOICE_STYLE:
+                    elevenlabs?.style ||
+                    process.env.ELEVENLABS_VOICE_STYLE ||
+                    "0",
+                ELEVENLABS_VOICE_USE_SPEAKER_BOOST:
+                    elevenlabs?.useSpeakerBoost ||
+                    process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST ||
+                    "true",
+                ELEVENLABS_OPTIMIZE_STREAMING_LATENCY:
+                    process.env.ELEVENLABS_OPTIMIZE_STREAMING_LATENCY || "0",
+                ELEVENLABS_OUTPUT_FORMAT:
+                    process.env.ELEVENLABS_OUTPUT_FORMAT || "pcm_16000",
+            }),
+
+            // VITS settings
+            VITS_VOICE: voiceSettings?.model || process.env.VITS_VOICE,
+            VITS_MODEL: process.env.VITS_MODEL,
         };

         return nodeEnvSchema.parse(config);
diff --git a/packages/plugin-node/src/services/speech.ts b/packages/plugin-node/src/services/speech.ts
index 58533f804b..d44c28fb69 100644
--- a/packages/plugin-node/src/services/speech.ts
+++ b/packages/plugin-node/src/services/speech.ts
@@ -4,6 +4,7 @@ import { getWavHeader } from "./audioUtils.ts";
 import { Service } from "@ai16z/eliza";
 import { validateNodeConfig } from "../enviroment.ts";
 import * as Echogarden from "echogarden";
+import { elizaLogger } from "@ai16z/eliza";

 function prependWavHeader(
     readable: Readable,
@@ -33,12 +34,50 @@ function prependWavHeader(
     return passThrough;
 }

+async function getVoiceSettings(runtime: IAgentRuntime) {
+    const hasElevenLabs = !!runtime.getSetting("ELEVENLABS_XI_API_KEY");
+    const useVits = !hasElevenLabs;
+
+    // Get voice settings from character card
+    const voiceSettings = runtime.character.settings?.voice;
+    const elevenlabsSettings = voiceSettings?.elevenlabs;
+
+    elizaLogger.debug("Voice settings:", {
+        hasElevenLabs,
+        useVits,
+        voiceSettings,
+        elevenlabsSettings,
+    });
+
+    return {
+        elevenlabsVoiceId:
+            elevenlabsSettings?.voiceId ||
+            runtime.getSetting("ELEVENLABS_VOICE_ID"),
+        elevenlabsModel:
+            elevenlabsSettings?.model ||
+            runtime.getSetting("ELEVENLABS_MODEL_ID") ||
+            "eleven_monolingual_v1",
+        elevenlabsStability:
+            elevenlabsSettings?.stability ||
+            runtime.getSetting("ELEVENLABS_VOICE_STABILITY") ||
+            "0.5",
+        // ... other ElevenLabs settings ...
+        vitsVoice:
+            voiceSettings?.model ||
+            voiceSettings?.url ||
+            runtime.getSetting("VITS_VOICE") ||
+            "en_US-hfc_female-medium",
+        useVits,
+    };
+}
+
 async function textToSpeech(runtime: IAgentRuntime, text: string) {
     await validateNodeConfig(runtime);
+    const { elevenlabsVoiceId } = await getVoiceSettings(runtime);

     try {
         const response = await fetch(
-            `https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
+            `https://api.elevenlabs.io/v1/text-to-speech/${elevenlabsVoiceId}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
             {
                 method: "POST",
                 headers: {
@@ -125,9 +164,10 @@ async function textToSpeech(runtime: IAgentRuntime, text: string) {
     } catch (error) {
         if (error.message === "QUOTA_EXCEEDED") {
             // Fall back to VITS
+            const { vitsVoice } = await getVoiceSettings(runtime);
             const { audio } = await Echogarden.synthesize(text, {
                 engine: "vits",
-                voice: "en_US-hfc_female-medium",
+                voice: vitsVoice,
             });

             let wavStream: Readable;
@@ -173,6 +213,53 @@ async function textToSpeech(runtime: IAgentRuntime, text: string) {
     }
 }

+async function processVitsAudio(audio: any): Promise<Readable> {
+    let wavStream: Readable;
+    if (audio instanceof Buffer) {
+        console.log("audio is a buffer");
+        wavStream = Readable.from(audio);
+    } else if ("audioChannels" in audio && "sampleRate" in audio) {
+        console.log("audio is a RawAudio");
+        const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
+        console.log("buffer length: ", floatBuffer.length);
+
+        const sampleRate = audio.sampleRate;
+        const floatArray = new Float32Array(floatBuffer.buffer);
+        const pcmBuffer = new Int16Array(floatArray.length);
+
+        for (let i = 0; i < floatArray.length; i++) {
+            pcmBuffer[i] = Math.round(floatArray[i] * 32767);
+        }
+
+        const wavHeaderBuffer = getWavHeader(
+            pcmBuffer.length * 2,
+            sampleRate,
+            1,
+            16
+        );
+        const wavBuffer = Buffer.concat([
+            wavHeaderBuffer,
+            Buffer.from(pcmBuffer.buffer),
+        ]);
+        wavStream = Readable.from(wavBuffer);
+    } else {
+        throw new Error("Unsupported audio format");
+    }
+    return wavStream;
+}
+
+async function generateVitsAudio(
+    runtime: IAgentRuntime,
+    text: string
+): Promise<Readable> {
+    const { vitsVoice } = await getVoiceSettings(runtime);
+    const { audio } = await Echogarden.synthesize(text, {
+        engine: "vits",
+        voice: vitsVoice,
+    });
+    return processVitsAudio(audio);
+}
+
 export class SpeechService extends Service implements ISpeechService {
     static serviceType: ServiceType = ServiceType.SPEECH_GENERATION;

@@ -184,103 +271,16 @@ export class SpeechService extends Service implements ISpeechService {

     async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
         try {
-            // check for elevenlabs API key
-            if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
-                return await textToSpeech(runtime, text);
-            }
-
-            // Default to VITS if no ElevenLabs API key
-            const { audio } = await Echogarden.synthesize(text, {
-                engine: "vits",
-                voice: "en_US-hfc_female-medium",
-            });
-
-            let wavStream: Readable;
-            if (audio instanceof Buffer) {
-                console.log("audio is a buffer");
-                wavStream = Readable.from(audio);
-            } else if ("audioChannels" in audio && "sampleRate" in audio) {
-                console.log("audio is a RawAudio");
-                const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
-                console.log("buffer length: ", floatBuffer.length);
-
-                // Get the sample rate from the RawAudio object
-                const sampleRate = audio.sampleRate;
-
-                // Create a Float32Array view of the floatBuffer
-                const floatArray = new Float32Array(floatBuffer.buffer);
-
-                // Convert 32-bit float audio to 16-bit PCM
-                const pcmBuffer = new Int16Array(floatArray.length);
-                for (let i = 0; i < floatArray.length; i++) {
-                    pcmBuffer[i] = Math.round(floatArray[i] * 32767);
-                }
-
-                // Prepend WAV header to the buffer
-                const wavHeaderBuffer = getWavHeader(
-                    pcmBuffer.length * 2,
-                    sampleRate,
-                    1,
-                    16
-                );
-                const wavBuffer = Buffer.concat([
-                    wavHeaderBuffer,
-                    Buffer.from(pcmBuffer.buffer),
-                ]);
+            const { useVits } = await getVoiceSettings(runtime);

-                wavStream = Readable.from(wavBuffer);
-            } else {
-                throw new Error("Unsupported audio format");
+            if (useVits || !runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
+                return await generateVitsAudio(runtime, text);
             }

-            return wavStream;
+            return await textToSpeech(runtime, text);
         } catch (error) {
             console.error("Speech generation error:", error);
-            // If ElevenLabs fails for any reason, fall back to VITS
-            const { audio } = await Echogarden.synthesize(text, {
-                engine: "vits",
-                voice: "en_US-hfc_female-medium",
-            });
-
-            let wavStream: Readable;
-            if (audio instanceof Buffer) {
-                console.log("audio is a buffer");
-                wavStream = Readable.from(audio);
-            } else if ("audioChannels" in audio && "sampleRate" in audio) {
-                console.log("audio is a RawAudio");
-                const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
-                console.log("buffer length: ", floatBuffer.length);
-
-                // Get the sample rate from the RawAudio object
-                const sampleRate = audio.sampleRate;
-
-                // Create a Float32Array view of the floatBuffer
-                const floatArray = new Float32Array(floatBuffer.buffer);
-
-                // Convert 32-bit float audio to 16-bit PCM
-                const pcmBuffer = new Int16Array(floatArray.length);
-                for (let i = 0; i < floatArray.length; i++) {
-                    pcmBuffer[i] = Math.round(floatArray[i] * 32767);
-                }
-
-                // Prepend WAV header to the buffer
-                const wavHeaderBuffer = getWavHeader(
-                    pcmBuffer.length * 2,
-                    sampleRate,
-                    1,
-                    16
-                );
-                const wavBuffer = Buffer.concat([
-                    wavHeaderBuffer,
-                    Buffer.from(pcmBuffer.buffer),
-                ]);
-
-                wavStream = Readable.from(wavBuffer);
-            } else {
-                throw new Error("Unsupported audio format");
-            }
-
-            return wavStream;
+            return await generateVitsAudio(runtime, text);
         }
     }
 }
diff --git a/packages/plugin-node/src/vendor/vitsVoiceList.ts b/packages/plugin-node/src/vendor/vitsVoiceList.ts
index a8527c8d2d..2f6720f157 100644
--- a/packages/plugin-node/src/vendor/vitsVoiceList.ts
+++ b/packages/plugin-node/src/vendor/vitsVoiceList.ts
@@ -1,12 +1,9 @@
-type SynthesisVoice = {
+export interface SynthesisVoice {
     name: string;
     languages: string[];
     gender: string;
     speakerCount?: number;
-    localService?: boolean;
-    voiceURI?: string;
-    default?: boolean;
-};
+}

 export const vitsVoiceList: SynthesisVoice[] = [
     {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 119aa16f1d..98d28901cc 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -78,7 +78,7 @@ importers:
       version: 9.15.0(jiti@2.4.0)
     eslint-config-prettier:
       specifier: 9.1.0
-      version: 9.1.0(eslint@9.15.0(jiti@2.4.0))
+      version: 9.1.0(eslint@9.16.0(jiti@2.4.0))
     husky:
       specifier: 9.1.7
       version: 9.1.7
@@ -32580,7 +32580,7 @@ snapshots:

   ref@1.3.5:
     dependencies:
-      bindings: 1.5.0
+      bindings: 1.2.1
       debug: 2.6.9
       nan: 2.22.0
     transitivePeerDependencies:
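Note (illustrative, not part of the patch): with the new voice.elevenlabs fields added to packages/core/src/types.ts, a character card can carry its own voice configuration. The sketch below shows roughly what that could look like in TypeScript, assuming Character is exported from @ai16z/eliza as the core types are; the voice ID and tuning values are placeholders, not values taken from this change.

// Illustrative sketch only: placeholder voice ID and tuning values.
import type { Character } from "@ai16z/eliza";

export const exampleVoiceConfig: Pick<Character, "settings"> = {
    settings: {
        voice: {
            // Used by VITS when no ElevenLabs API key is configured
            model: "en_US-hfc_female-medium",
            // Structured ElevenLabs config read by validateNodeConfig/getVoiceSettings
            elevenlabs: {
                voiceId: "YOUR_VOICE_ID", // placeholder, not a real voice ID
                model: "eleven_monolingual_v1",
                stability: "0.5",
                similarityBoost: "0.75",
                style: "0",
                useSpeakerBoost: "true",
            },
        },
    },
};

With a card like this, validateNodeConfig and getVoiceSettings prefer the character values, then the corresponding ELEVENLABS_* / VITS_VOICE environment variables, then the hard-coded defaults; when ELEVENLABS_XI_API_KEY is absent, SpeechService.generate goes straight to VITS.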