From db19c2d7a4f89d361256dde540b69c9769676a77 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Oct 2024 22:23:31 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
AvatarChatbot/avatarchatbot.py | 19 +-
.../docker_compose/intel/hpu/gaudi/README.md | 6 +-
AvatarChatbot/ui/gradio/app_gradio_demo.py | 259 ++++++++++--------
3 files changed, 165 insertions(+), 119 deletions(-)
diff --git a/AvatarChatbot/avatarchatbot.py b/AvatarChatbot/avatarchatbot.py
index 65d118cbd..bdfa5f0c8 100644
--- a/AvatarChatbot/avatarchatbot.py
+++ b/AvatarChatbot/avatarchatbot.py
@@ -23,7 +23,7 @@ def check_env_vars(env_var_list):
for var in env_var_list:
if not os.getenv(var):
print(f"Error: The environment variable '{var}' is not set.")
- sys.exit(1) # Exit the program with a non-zero status code
+ sys.exit(1) # Exit the program with a non-zero status code
print("All environment variables are set.")
@@ -74,7 +74,20 @@ def add_remote_service(self):
if __name__ == "__main__":
- check_env_vars([MEGA_SERVICE_HOST_IP, MEGA_SERVICE_PORT, ASR_SERVICE_HOST_IP, ASR_SERVICE_PORT, LLM_SERVICE_HOST_IP, LLM_SERVICE_PORT, TTS_SERVICE_HOST_IP, TTS_SERVICE_PORT, ANIMATION_SERVICE_HOST_IP, ANIMATION_SERVICE_PORT])
-
+ check_env_vars(
+ [
+ MEGA_SERVICE_HOST_IP,
+ MEGA_SERVICE_PORT,
+ ASR_SERVICE_HOST_IP,
+ ASR_SERVICE_PORT,
+ LLM_SERVICE_HOST_IP,
+ LLM_SERVICE_PORT,
+ TTS_SERVICE_HOST_IP,
+ TTS_SERVICE_PORT,
+ ANIMATION_SERVICE_HOST_IP,
+ ANIMATION_SERVICE_PORT,
+ ]
+ )
+
avatarchatbot = AvatarChatbotService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT)
avatarchatbot.add_remote_service()
diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md
index 9623757b9..b9a8dc8a3 100644
--- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md
+++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md
@@ -23,7 +23,7 @@ flowchart LR
classDef invisible fill:transparent,stroke:transparent;
style AvatarChatbot-Megaservice stroke:#000000
- %% Subgraphs %%
+ %% Subgraphs %%
subgraph AvatarChatbot-Megaservice["AvatarChatbot Megaservice"]
direction LR
ASR([ASR
3001]):::blue
@@ -43,7 +43,7 @@ flowchart LR
invis2[ ]:::invisible
GW([AvatarChatbot GateWay
]):::orange
end
- subgraph
+ subgraph
direction LR
X([OPEA Microservice]):::blue
Y{{Open Source Service}}:::thistle
@@ -51,7 +51,7 @@ flowchart LR
Z1([UI]):::orchid
end
- %% Services %%
+ %% Services %%
WHISPER{{Whisper service
7066}}:::thistle
TGI{{LLM service
3006}}:::thistle
T5{{Speecht5 service
7055}}:::thistle
diff --git a/AvatarChatbot/ui/gradio/app_gradio_demo.py b/AvatarChatbot/ui/gradio/app_gradio_demo.py
index a4f92fd0a..1f1cea566 100755
--- a/AvatarChatbot/ui/gradio/app_gradio_demo.py
+++ b/AvatarChatbot/ui/gradio/app_gradio_demo.py
@@ -1,24 +1,26 @@
-import aiohttp
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
import asyncio
import base64
-import gradio as gr
import io
-import numpy as np
import os
-from PIL import Image
-import requests
+import pdb
import shutil
-import soundfile as sf
import subprocess
import time
-import pdb
+import aiohttp
+import gradio as gr
+import numpy as np
+import requests
+import soundfile as sf
+from PIL import Image
-#%% AudioQnA functions
+
+# %% AudioQnA functions
def preprocess_audio(audio):
- """
- The audio data is a 16-bit integer array with values ranging from -32768 to 32767 and the shape of the audio data array is (samples,)
- """
+ """The audio data is a 16-bit integer array with values ranging from -32768 to 32767; its shape is (samples,)."""
sr, y = audio
# Convert to normalized float32 audio
y = y.astype(np.float32)
@@ -26,7 +28,7 @@ def preprocess_audio(audio):
# Convert the normalized float32 audio to a WAV file in memory
buf = io.BytesIO()
sf.write(buf, y, sr, format="WAV")
- buf.seek(0) # Reset the buffer position to the beginning
+ buf.seek(0) # Reset the buffer position to the beginning
# Encode the WAV file to base64 string
base64_bytes = base64.b64encode(buf.read())
base64_string = base64_bytes.decode("utf-8")
@@ -62,7 +64,7 @@ async def transcribe(audio_input):
async with aiohttp.ClientSession() as session:
async with session.post(ai_chatbot_url, json=initial_inputs) as response:
- # response = requests.post(ai_chatbot_url, json=initial_inputs)
+ # response = requests.post(ai_chatbot_url, json=initial_inputs)
# Check the response status code
if response.status == 200:
@@ -81,21 +83,23 @@ async def transcribe(audio_input):
chat_ai = chat_ai[: last_punc_idx + 1]
chat_history += f"AI: {chat_ai}"
chat_history = chat_history.replace("OPEX", "OPEA")
- return (sampling_rate, audio_int16) # handle the response
+ return (sampling_rate, audio_int16) # handle the response
else:
return {"error": "Failed to transcribe audio", "status_code": response.status_code}
+
def resize_image(image_pil, size=(720, 720)):
- """Resize the image to the specified size"""
+ """Resize the image to the specified size."""
return image_pil.resize(size, Image.LANCZOS)
+
def resize_video(video_path, save_path, size=(720, 1280)):
- """Resize the video to the specified size"""
- command_resize_video = f'ffmpeg -y -i {video_path} -vf scale={size[0]}:{size[1]} {save_path}'
+ """Resize the video to the specified size."""
+ command_resize_video = f"ffmpeg -y -i {video_path} -vf scale={size[0]}:{size[1]} {save_path}"
subprocess.run(command_resize_video, shell=True)
-#%% Wav2Lip functions
+# %% Wav2Lip functions
async def gen_video(image, audio, model_choice):
"""Input: image (saved .png path), ai audio (saved .wav path); Output: video"""
# 0. Preprocess audio
@@ -107,33 +111,33 @@ async def gen_video(image, audio, model_choice):
# 1. Set environment variables
match model_choice:
case "wav2lip":
- os.environ['INFERENCE_MODE'] = 'wav2lip_only'
- os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip.pth'
+ os.environ["INFERENCE_MODE"] = "wav2lip_only"
+ os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth"
case "wav2lip+GAN":
- os.environ['INFERENCE_MODE'] = 'wav2lip_only'
- os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip_gan.pth'
+ os.environ["INFERENCE_MODE"] = "wav2lip_only"
+ os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip_gan.pth"
case "wav2lip+GFPGAN":
- os.environ['INFERENCE_MODE'] = 'wav2lip+gfpgan'
- os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip.pth'
+ os.environ["INFERENCE_MODE"] = "wav2lip+gfpgan"
+ os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth"
# os.environ['INFERENCE_MODE'] = 'wav2lip_only'
# os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip_gan.pth'
- os.environ['FACE'] = image # path to either an image or a video
- os.environ['AUDIO'] = output_audio_save_path # path to .wav audio
+ os.environ["FACE"] = image # path to either an image or a video
+ os.environ["AUDIO"] = output_audio_save_path # path to .wav audio
# os.environ['AUDIO'] = audio
- os.environ['FACESIZE'] = '96'
- os.environ['OUTFILE'] = 'outputs/result6.mp4'
- os.environ['GFPGAN_MODEL_VERSION'] = '1.3'
- os.environ['UPSCALE_FACTOR'] = '1' # int
+ os.environ["FACESIZE"] = "96"
+ os.environ["OUTFILE"] = "outputs/result6.mp4"
+ os.environ["GFPGAN_MODEL_VERSION"] = "1.3"
+ os.environ["UPSCALE_FACTOR"] = "1" # int
# os.environ['FPS'] = '25.' # can be lower (e.g., 10)
- os.environ['FPS'] = '10.' # can be lower when using an image (e.g., 10)
-
+ os.environ["FPS"] = "10." # can be lower when using an image (e.g., 10)
+
# 2. Run inference.sh bash script to perform Wav2Lip+GFPGAN inference
# Output video is saved at the path 'OUTFILE'
- command_wav2lip_gfpgan = 'bash inference_vars.sh'
+ command_wav2lip_gfpgan = "bash inference_vars.sh"
subprocess.run(command_wav2lip_gfpgan, shell=True)
-
- outfile = os.environ.get('OUTFILE')
+
+ outfile = os.environ.get("OUTFILE")
if os.path.exists(outfile):
res_video = outfile
else:
@@ -141,14 +145,14 @@ async def gen_video(image, audio, model_choice):
return res_video
-#%% AI Avatar demo function
+# %% AI Avatar demo function
# ctao 7/19 - make it asynchronous
async def aiavatar_demo(audio_input):
"""Input: mic audio, image; Output: ai audio, text, text, ai video"""
# Include AudioQnA
- output_audio = await transcribe(audio_input) # AudioQnA
+ output_audio = await transcribe(audio_input) # AudioQnA
- if isinstance(output_audio, dict): # in case of an error
+ if isinstance(output_audio, dict): # in case of an error
return None, None
else:
sr, audio_int16 = output_audio
@@ -156,17 +160,17 @@ async def aiavatar_demo(audio_input):
sf.write(audio_file, audio_int16, sr)
# return audio_file, audio_file, image
return audio_file
-
+
async def final_update(audio, image, model_choice):
res_video = await gen_video(image, audio, model_choice)
return res_video
-#%% Main
+# %% Main
if __name__ == "__main__":
# HOST_IP = os.getenv("host_ip")
- HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode('utf-8').strip()
+ HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode("utf-8").strip()
# Fetch the AudioQnA backend server
ai_chatbot_url = f"http://{HOST_IP}:3008/v1/audioqna"
@@ -177,52 +181,61 @@ async def final_update(audio, image, model_choice):
# Prepare 3 image paths
# HOME = os.getenv("HOME")
# HOME="/mnt/localdisk4"
- HOME="/home/demo/"
- image_pils = [Image.open(os.path.join("../assets/img/woman1.png")),
- Image.open(os.path.join("../assets/img/man1.png")),
- Image.open(os.path.join("../assets/img/woman2.png"))]
-
- video_paths = [os.path.join("../assets/video/man1.mp4"),
- os.path.join("../assets/video/woman2.mp4"),
- os.path.join("../assets/video/man4.mp4")]
+ HOME = "/home/demo/"
+ image_pils = [
+ Image.open(os.path.join("../assets/img/woman1.png")),
+ Image.open(os.path.join("../assets/img/man1.png")),
+ Image.open(os.path.join("../assets/img/woman2.png")),
+ ]
+
+ video_paths = [
+ os.path.join("../assets/video/man1.mp4"),
+ os.path.join("../assets/video/woman2.mp4"),
+ os.path.join("../assets/video/man4.mp4"),
+ ]
def image_to_base64(image_path):
with open(image_path, "rb") as image_file:
- return base64.b64encode(image_file.read()).decode('utf-8')
+ return base64.b64encode(image_file.read()).decode("utf-8")
# Convert your images to Base64
# opea_qr_base64 = image_to_base64('../rfcs/opea_qr.png')
# opea_gh_qr_base64 = image_to_base64('../rfcs/opea_gh_qr.png')
- xeon_base64 = image_to_base64('../rfcs/xeon.jpg')
- gaudi_base64 = image_to_base64('../rfcs/gaudi.png')
+ xeon_base64 = image_to_base64("../rfcs/xeon.jpg")
+ gaudi_base64 = image_to_base64("../rfcs/gaudi.png")
# List of prerecorded WAV files containing audio questions
- audio_filepaths = ["../assets/audio/intel1.wav",
- "../assets/audio/intel2.wav",
- "../assets/audio/intel3.wav",
- "../assets/audio/intel4.wav",
- "../assets/audio/pnp1.wav",
- "../assets/audio/pnp2.wav",
- "../assets/audio/pnp3.wav",
- "../assets/audio/pnp4.wav",
- "../assets/audio/entertainment1.wav",
- "../assets/audio/entertainment2.wav"]
- audio_questions = ["1. What are the latest data center processor and AI accelerator products at Intel? Name them.",
- "2. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?",
- "3. What is Intel's Gaudi 3 AI Accelerator performance compared to Nvidia H100?",
- "4. What kinds of Intel AI tools are available to accelerate AI workloads?",
- "5. What is Plug and Play Technology Center? Where is it located?",
- "6. Tell us about inflation in the US in the past few years?",
- "7. What is the difference between an index fund and a mutual fund?",
- "8. What is the difference between pretax and roth retirement accounts?",
- "9. Which team won the Superbowl in 2022?",
- "10. In the Lord of the Rings, who threw the Ring into Mount Doom?"]
+ audio_filepaths = [
+ "../assets/audio/intel1.wav",
+ "../assets/audio/intel2.wav",
+ "../assets/audio/intel3.wav",
+ "../assets/audio/intel4.wav",
+ "../assets/audio/pnp1.wav",
+ "../assets/audio/pnp2.wav",
+ "../assets/audio/pnp3.wav",
+ "../assets/audio/pnp4.wav",
+ "../assets/audio/entertainment1.wav",
+ "../assets/audio/entertainment2.wav",
+ ]
+ audio_questions = [
+ "1. What are the latest data center processor and AI accelerator products at Intel? Name them.",
+ "2. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?",
+ "3. What is Intel's Gaudi 3 AI Accelerator performance compared to Nvidia H100?",
+ "4. What kinds of Intel AI tools are available to accelerate AI workloads?",
+ "5. What is Plug and Play Technology Center? Where is it located?",
+ "6. Tell us about inflation in the US in the past few years?",
+ "7. What is the difference between an index fund and a mutual fund?",
+ "8. What is the difference between pretax and roth retirement accounts?",
+ "9. Which team won the Superbowl in 2022?",
+ "10. In the Lord of the Rings, who threw the Ring into Mount Doom?",
+ ]
# Demo frontend
demo = gr.Blocks()
with demo:
# Define processing functions
count = 0
+
def initial_process(audio_input):
global count, chat_history
start_time = time.time()
@@ -249,7 +262,7 @@ def update_selected_image_state(image_index):
return f"inputs/face_{image_index}.png"
else:
return f"inputs/video_{image_index - len(image_pils)}.mp4"
-
+
def update_audio_input(audio_choice):
if audio_choice:
audio_index = int(audio_choice.split(".")[0]) - 1
@@ -257,17 +270,19 @@ def update_audio_input(audio_choice):
shutil.copyfile(audio_filepaths[audio_index], audio_filepath_gradio)
# audio_input.value = audio_filepath_gradio
return audio_filepath_gradio
-
+
# UI Components
# Title & Introduction
gr.Markdown("
Welcome to our AI Avatar Audio Chatbot! This application leverages PyTorch and OPEA (Open Platform for Enterprise AI) v0.8 to provide you with a human-like conversational experience. It's run on Intel® Gaudi® AI Accelerator and Intel® Xeon® Processor, with hardware and software optimizations.
Please feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.
OPEA megaservice deployed:
OPEA microservices deployed: +
OPEA microservices deployed:
OPEA's "AvatarChatbot" megaservice is composed of "ASR->LLM->TTS->Animation" microservices. It first generates an expert answer based on your query, and then animates the avatar figure with output audio. Feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.
+ """ + ) + #OPEA's "AvatarChatbot" megaservice is composed of "ASR->LLM->TTS->Animation" microservices. It first generates an expert answer based on your query, and then animates the avatar figure with output audio. Feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.
with gr.Row(): gr.Image("./flowchart_1.png", label="Megaservice Flowchart") with gr.Row(): - gr.Markdown(""" + gr.Markdown( + """The AI Avatar Audio Chatbot is powered by the following Intel® AI software:
Intel is committed to respecting human rights and avoiding complicity in human rights abuses. See Intel's Global Human Rights Principles. Intel's products and software are intended only to be used in applications that do not cause or contribute to a violation of an internationally recognized human right.
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.
You may not use or facilitate the use of this document in connection with any infringement or other legal analysis concerning Intel products described herein. You agree to grant Intel a non-exclusive, royalty-free license to any patent claim thereafter drafted which includes subject matter disclosed herein.