Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PDF to Audio and Audio to Audio #37

Merged
merged 5 commits into from
Apr 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ RUN git clone https://github.com/Josh-XT/DeepSeek-VL deepseek && \
cd deepseek && \
pip install --no-cache-dir -e . && \
cd ..
RUN pip install spacy && \
python -m spacy download en_core_web_sm
COPY . .
ENV HOST 0.0.0.0
EXPOSE 8091
Expand Down
32 changes: 32 additions & 0 deletions Pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from pyngrok import ngrok
import requests
import base64
import pdfplumber
from typing import List

try:
from ezlocalai.IMG import IMG
Expand Down Expand Up @@ -71,6 +73,36 @@ def __init__(self):
else:
self.local_uri = os.environ.get("EZLOCALAI_URL", "http://localhost:8091")

async def pdf_to_audio(self, title, voice, pdf, chunk_size=200):
    """Turn a base64-encoded PDF into synthesized speech.

    The PDF is written to ``<cwd>/outputs/<title>.pdf``, its text is
    extracted page by page with pdfplumber, and the combined text is handed
    to ``self.ctts.generate``.

    Args:
        title: Base name used for both the saved PDF and the ``.wav`` output.
            Only its final path component is used.
        voice: Voice identifier forwarded to ``self.ctts.generate``.
        pdf: The PDF as a data URL (``<header>,<base64>``) or as a bare
            base64 string.
        chunk_size: Accepted for backward compatibility; not used here.

    Returns:
        Whatever ``self.ctts.generate`` returns, or ``None`` when no text
        could be extracted from the PDF.
    """
    # The title can originate from request data, so keep only the final path
    # component to stop it from escaping the outputs directory.
    safe_title = os.path.basename(str(title))
    output_dir = os.path.join(os.getcwd(), "outputs")
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{safe_title}.pdf")

    # Drop the "data:...;base64," header when one is present.
    _, separator, encoded = pdf.partition(",")
    if not separator:
        encoded = pdf
    with open(file_path, "wb") as pdf_file:
        pdf_file.write(base64.b64decode(encoded))

    with pdfplumber.open(file_path) as document:
        # extract_text() may yield None for a page without text in some
        # pdfplumber versions; treat that as an empty page.
        content = "\n".join(
            page.extract_text() or "" for page in document.pages
        )
    if not content.strip():
        return None
    return await self.ctts.generate(
        text=content,
        voice=voice,
        local_uri=self.local_uri,
        output_file_name=f"{safe_title}.wav",
    )

async def audio_to_audio(self, voice, audio):
    """Transcribe an audio data URL and re-speak the text in ``voice``.

    Args:
        voice: Voice identifier forwarded to ``self.ctts.generate``.
        audio: Data URL of the form ``data:audio/<format>;base64,<payload>``.

    Returns:
        Whatever ``self.ctts.generate`` returns for the transcribed text.
    """
    pieces = audio.split(",")
    # "data:audio/wav;base64" -> "audio/wav" -> "wav"
    mime_type = pieces[0].split(":")[1].split(";")[0]
    audio_format = mime_type.split("/")[1]
    raw_audio = base64.b64decode(pieces[1])
    # NOTE(review): the decoded bytes are passed under the name
    # ``base64_audio`` -- confirm that transcribe_audio expects raw bytes.
    transcript = self.stt.transcribe_audio(
        base64_audio=raw_audio, audio_format=audio_format
    )
    return await self.ctts.generate(
        text=transcript, voice=voice, local_uri=self.local_uri
    )

async def get_response(self, data, completion_type="chat"):
data["local_uri"] = self.local_uri
images = []
Expand Down
16 changes: 16 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import base64
import os
import logging
import uuid
from dotenv import load_dotenv

load_dotenv()
Expand Down Expand Up @@ -304,6 +305,21 @@ class TextToSpeech(BaseModel):
dependencies=[Depends(verify_api_key)],
)
async def text_to_speech(tts: TextToSpeech, user=Depends(verify_api_key)):
if tts.input.startswith("data:"):
if "pdf" in tts.input:
audio = await pipe.pdf_to_audio(
title=tts.user if tts.user else f"{uuid.uuid4().hex}",
voice=tts.voice,
pdf=tts.input,
chunk_size=200,
)
return audio
if "audio/" in tts.input:
audio = await pipe.audio_to_audio(
voice=tts.voice,
audio=tts.input,
)
return audio
audio = await pipe.ctts.generate(
text=tts.input, voice=tts.voice, language=tts.language
)
Expand Down
93 changes: 78 additions & 15 deletions ezlocalai/CTTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
import logging
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from typing import List
import spacy
from pydub import AudioSegment

try:
import deepspeed
Expand Down Expand Up @@ -67,14 +70,42 @@ def __init__(self):
wav_files.append(file.replace(".wav", ""))
self.voices = wav_files

def chunk_content(self, text: str, chunk_size: int) -> List[str]:
    """Split ``text`` into chunks made of whole sentences.

    Sentences are detected with spaCy's ``en_core_web_sm`` pipeline and
    packed greedily: a new chunk is started when adding the next sentence
    would push the running token count past ``chunk_size``. A single
    sentence longer than ``chunk_size`` still becomes its own chunk.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of spaCy tokens per chunk.

    Returns:
        The chunk strings in document order, with the original spacing
        inside each chunk preserved.
    """
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # spacy.load raises OSError when the model package is missing;
        # download it once and retry.
        from spacy.cli import download as download_model

        download_model("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
    # Raise the pipeline's length limit just far enough to accept this text.
    nlp.max_length = max(nlp.max_length, len(text) + 1)
    doc = nlp(text)

    chunks: List[str] = []
    current: List[str] = []
    current_len = 0
    for sentence in doc.sents:
        sentence_tokens = len(sentence)
        if current and current_len + sentence_tokens > chunk_size:
            chunk_text = "".join(current).strip()
            if chunk_text:
                chunks.append(chunk_text)
            current = []
            current_len = 0
        # text_with_ws keeps the original whitespace, so punctuation and
        # contractions are not separated by stray spaces when rejoined.
        current.append(sentence.text_with_ws)
        current_len += sentence_tokens
    if current:
        chunk_text = "".join(current).strip()
        if chunk_text:
            chunks.append(chunk_text)
    return chunks

async def generate(
self,
text,
voice="default",
language="en",
local_uri=None,
output_file_name=None,
):
output_file_name = f"{uuid.uuid4().hex}.wav"
if not output_file_name:
output_file_name = f"{uuid.uuid4().hex}.wav"
output_file = os.path.join(self.output_folder, output_file_name)
cleaned_string = re.sub(r"([!?.])\1+", r"\1", text)
cleaned_string = re.sub(
Expand All @@ -95,20 +126,52 @@ async def generate(
max_ref_length=self.model.config.max_ref_len,
sound_norm_refs=self.model.config.sound_norm_refs,
)
output = self.model.inference(
text=text,
language=language,
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
temperature=0.7,
length_penalty=float(self.model.config.length_penalty),
repetition_penalty=10.0,
top_k=int(self.model.config.top_k),
top_p=float(self.model.config.top_p),
enable_text_splitting=True,
)
torchaudio.save(output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000)
torch.cuda.empty_cache()
if len(text) > 700:
text_chunks = self.chunk_content(text, 200)
output_files = []
for chunk in text_chunks:
output = self.model.inference(
text=chunk,
language=language,
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
temperature=0.7,
length_penalty=float(self.model.config.length_penalty),
repetition_penalty=10.0,
top_k=int(self.model.config.top_k),
top_p=float(self.model.config.top_p),
enable_text_splitting=True,
)
output_file_name = f"{uuid.uuid4().hex}.wav"
output_file = os.path.join(self.output_folder, output_file_name)
torchaudio.save(
output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000
)
output_files.append(output_file)
torch.cuda.empty_cache()
combined_audio = AudioSegment.empty()
for file in output_files:
audio = AudioSegment.from_file(file)
combined_audio += audio
os.remove(file)
combined_audio.export(output_file, format="wav")
else:
output = self.model.inference(
text=text,
language=language,
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
temperature=0.7,
length_penalty=float(self.model.config.length_penalty),
repetition_penalty=10.0,
top_k=int(self.model.config.top_k),
top_p=float(self.model.config.top_p),
enable_text_splitting=True,
)
torchaudio.save(
output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000
)
torch.cuda.empty_cache()
if local_uri:
return f"{local_uri}/outputs/{output_file_name}"
with open(output_file, "rb") as file:
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ pyngrok==7.1.5
accelerate==0.27.2
python-multipart
llama-cpp-python==0.2.55
openai
openai
pdfplumber
spacy