DevXT-LLC · Josh-XT · Apr 7, 2024 · Apr 6, 2024 · Apr 6, 2024 · Apr 6, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -12,6 +12,8 @@ RUN git clone https://github.com/Josh-XT/DeepSeek-VL deepseek && \
     cd deepseek && \
     pip install --no-cache-dir -e . && \
     cd ..
+# Install spaCy and pre-download en_core_web_sm so CTTS.chunk_content need not fetch it at runtime.
+RUN pip install --no-cache-dir spacy && python -m spacy download en_core_web_sm
 COPY . .
 ENV HOST 0.0.0.0
 EXPOSE 8091

diff --git a/Pipes.py b/Pipes.py
@@ -7,6 +7,8 @@
 from pyngrok import ngrok
 import requests
 import base64
+import pdfplumber
+from typing import List
 
 try:
     from ezlocalai.IMG import IMG
@@ -71,6 +73,36 @@ def __init__(self):
         else:
             self.local_uri = os.environ.get("EZLOCALAI_URL", "http://localhost:8091")
 
+    async def pdf_to_audio(self, title, voice, pdf, chunk_size=200):
+        """Save a base64 data-URL PDF under outputs/, extract its text and speak it in `voice`.
+
+        Returns whatever `self.ctts.generate` returns, or None when no text could be extracted.
+        `chunk_size` is currently unused and kept only for caller compatibility.
+        """
+        title = os.path.basename(title)  # title can come from the request; keep writes inside outputs/
+        file_path = os.path.join(os.getcwd(), "outputs", f"{title}.pdf")
+        with open(file_path, "wb") as pdf_file:
+            pdf_file.write(base64.b64decode(pdf.split(",")[1]))
+        with pdfplumber.open(file_path) as document:
+            # extract_text() may give None/"" for pages without a text layer; treat both as empty.
+            content = "\n".join(page.extract_text() or "" for page in document.pages)
+        if not content.strip():
+            return None
+        return await self.ctts.generate(
+            text=content, voice=voice, local_uri=self.local_uri,
+            output_file_name=f"{title}.wav",
+        )
+
+    async def audio_to_audio(self, voice, audio):
+        """Transcribe a base64 data-URL audio clip, then synthesize the transcript in `voice`."""
+        # "data:audio/wav;base64,<payload>" -> "wav"
+        audio_format = audio.split(",")[0].split(":")[1].split(";")[0].split("/")[1]
+        # NOTE(review): decoded bytes are passed as `base64_audio` — confirm STT expects raw bytes.
+        text = self.stt.transcribe_audio(
+            base64_audio=base64.b64decode(audio.split(",")[1]), audio_format=audio_format
+        )
+        return await self.ctts.generate(text=text, voice=voice, local_uri=self.local_uri)
+
     async def get_response(self, data, completion_type="chat"):
         data["local_uri"] = self.local_uri
         images = []

diff --git a/app.py b/app.py
@@ -17,6 +17,7 @@
 import base64
 import os
 import logging
+import uuid
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -304,6 +305,21 @@ class TextToSpeech(BaseModel):
     dependencies=[Depends(verify_api_key)],
 )
 async def text_to_speech(tts: TextToSpeech, user=Depends(verify_api_key)):
+    if tts.input.startswith("data:"):
+        # Sniff only the data-URL header; the base64 payload can contain "pdf" by chance.
+        media_type = tts.input.split(",", 1)[0]
+        if "pdf" in media_type:
+            return await pipe.pdf_to_audio(
+                title=tts.user if tts.user else f"{uuid.uuid4().hex}",
+                voice=tts.voice,
+                pdf=tts.input,
+                chunk_size=200,
+            )
+        if "audio/" in media_type:
+            return await pipe.audio_to_audio(
+                voice=tts.voice,
+                audio=tts.input,
+            )
     audio = await pipe.ctts.generate(
         text=tts.input, voice=tts.voice, language=tts.language
     )

diff --git a/ezlocalai/CTTS.py b/ezlocalai/CTTS.py
@@ -8,6 +8,9 @@
 import logging
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
+from typing import List
+import spacy
+from pydub import AudioSegment
 
 try:
     import deepspeed
@@ -67,14 +70,42 @@ def __init__(self):
                 wav_files.append(file.replace(".wav", ""))
         self.voices = wav_files
 
+    def chunk_content(self, text: str, chunk_size: int) -> List[str]:
+        """Split text into sentence-aligned chunks of at most about chunk_size spaCy tokens.
+
+        Sentences are never split, so one sentence longer than chunk_size becomes its own
+        oversized chunk. Returns the chunk strings in document order.
+        """
+        try:
+            sp = spacy.load("en_core_web_sm")
+        except OSError:  # raised by spacy.load when the model package is not installed
+            spacy.cli.download("en_core_web_sm")
+            sp = spacy.load("en_core_web_sm")
+        # Raise spaCy's input-length guard just enough for this text.
+        sp.max_length = max(sp.max_length, len(text) + 1)
+        content_chunks, chunk, chunk_len = [], [], 0
+        for sentence in sp(text).sents:
+            sentence_tokens = len(sentence)
+            if chunk_len + sentence_tokens > chunk_size and chunk:
+                # text_with_ws restores the source spacing instead of "word , word ." output.
+                content_chunks.append("".join(t.text_with_ws for t in chunk))
+                chunk, chunk_len = [], 0
+            chunk.extend(sentence)
+            chunk_len += sentence_tokens
+        if chunk:
+            content_chunks.append("".join(t.text_with_ws for t in chunk))
+        return content_chunks
+
     async def generate(
         self,
         text,
         voice="default",
         language="en",
         local_uri=None,
+        output_file_name=None,
     ):
-        output_file_name = f"{uuid.uuid4().hex}.wav"
+        if not output_file_name:
+            output_file_name = f"{uuid.uuid4().hex}.wav"
         output_file = os.path.join(self.output_folder, output_file_name)
         cleaned_string = re.sub(r"([!?.])\1+", r"\1", text)
         cleaned_string = re.sub(
@@ -95,20 +126,52 @@ async def generate(
             max_ref_length=self.model.config.max_ref_len,
             sound_norm_refs=self.model.config.sound_norm_refs,
         )
-        output = self.model.inference(
-            text=text,
-            language=language,
-            gpt_cond_latent=gpt_cond_latent,
-            speaker_embedding=speaker_embedding,
-            temperature=0.7,
-            length_penalty=float(self.model.config.length_penalty),
-            repetition_penalty=10.0,
-            top_k=int(self.model.config.top_k),
-            top_p=float(self.model.config.top_p),
-            enable_text_splitting=True,
-        )
-        torchaudio.save(output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000)
-        torch.cuda.empty_cache()
+        if len(text) > 700:
+            # Long input: synthesize sentence-aligned chunks separately, then stitch them together.
+            text_chunks = self.chunk_content(text, 200)
+            chunk_files = []
+            for chunk in text_chunks:
+                output = self.model.inference(
+                    text=chunk,
+                    language=language,
+                    gpt_cond_latent=gpt_cond_latent,
+                    speaker_embedding=speaker_embedding,
+                    temperature=0.7,
+                    length_penalty=float(self.model.config.length_penalty),
+                    repetition_penalty=10.0,
+                    top_k=int(self.model.config.top_k),
+                    top_p=float(self.model.config.top_p),
+                    enable_text_splitting=True,
+                )
+                # Per-chunk temp file; never rebind output_file(_name), which name the final result.
+                chunk_file = os.path.join(self.output_folder, f"{uuid.uuid4().hex}.wav")
+                torchaudio.save(
+                    chunk_file, torch.tensor(output["wav"]).unsqueeze(0), 24000
+                )
+                chunk_files.append(chunk_file)
+                torch.cuda.empty_cache()
+            combined_audio = AudioSegment.empty()
+            for file in chunk_files:
+                combined_audio += AudioSegment.from_file(file)
+                os.remove(file)
+            combined_audio.export(output_file, format="wav")
+        else:
+            output = self.model.inference(
+                text=text,
+                language=language,
+                gpt_cond_latent=gpt_cond_latent,
+                speaker_embedding=speaker_embedding,
+                temperature=0.7,
+                length_penalty=float(self.model.config.length_penalty),
+                repetition_penalty=10.0,
+                top_k=int(self.model.config.top_k),
+                top_p=float(self.model.config.top_p),
+                enable_text_splitting=True,
+            )
+            torchaudio.save(
+                output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000
+            )
+            torch.cuda.empty_cache()
         if local_uri:
             return f"{local_uri}/outputs/{output_file_name}"
         with open(output_file, "rb") as file:

diff --git a/requirements.txt b/requirements.txt
@@ -20,4 +20,6 @@ pyngrok==7.1.5
 accelerate==0.27.2
 python-multipart
 llama-cpp-python==0.2.55
-openai
+openai
+pdfplumber
+spacy