diff --git a/AvatarChatbot/avatarchatbot.py b/AvatarChatbot/avatarchatbot.py index 65d118cbd..bdfa5f0c8 100644 --- a/AvatarChatbot/avatarchatbot.py +++ b/AvatarChatbot/avatarchatbot.py @@ -23,7 +23,7 @@ def check_env_vars(env_var_list): for var in env_var_list: if not os.getenv(var): print(f"Error: The environment variable '{var}' is not set.") - sys.exit(1) # Exit the program with a non-zero status code + sys.exit(1) # Exit the program with a non-zero status code print("All environment variables are set.") @@ -74,7 +74,20 @@ def add_remote_service(self): if __name__ == "__main__": - check_env_vars([MEGA_SERVICE_HOST_IP, MEGA_SERVICE_PORT, ASR_SERVICE_HOST_IP, ASR_SERVICE_PORT, LLM_SERVICE_HOST_IP, LLM_SERVICE_PORT, TTS_SERVICE_HOST_IP, TTS_SERVICE_PORT, ANIMATION_SERVICE_HOST_IP, ANIMATION_SERVICE_PORT]) - + check_env_vars( + [ + MEGA_SERVICE_HOST_IP, + MEGA_SERVICE_PORT, + ASR_SERVICE_HOST_IP, + ASR_SERVICE_PORT, + LLM_SERVICE_HOST_IP, + LLM_SERVICE_PORT, + TTS_SERVICE_HOST_IP, + TTS_SERVICE_PORT, + ANIMATION_SERVICE_HOST_IP, + ANIMATION_SERVICE_PORT, + ] + ) + avatarchatbot = AvatarChatbotService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) avatarchatbot.add_remote_service() diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md index 9623757b9..b9a8dc8a3 100644 --- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md +++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md @@ -23,7 +23,7 @@ flowchart LR classDef invisible fill:transparent,stroke:transparent; style AvatarChatbot-Megaservice stroke:#000000 - %% Subgraphs %% + %% Subgraphs %% subgraph AvatarChatbot-Megaservice["AvatarChatbot Megaservice"] direction LR ASR([ASR
3001]):::blue @@ -43,7 +43,7 @@ flowchart LR invis2[ ]:::invisible GW([AvatarChatbot GateWay
]):::orange end - subgraph + subgraph direction LR X([OPEA Microservice]):::blue Y{{Open Source Service}}:::thistle @@ -51,7 +51,7 @@ flowchart LR Z1([UI]):::orchid end - %% Services %% + %% Services %% WHISPER{{Whisper service
7066}}:::thistle TGI{{LLM service
3006}}:::thistle T5{{Speecht5 service
7055}}:::thistle diff --git a/AvatarChatbot/ui/gradio/app_gradio_demo.py b/AvatarChatbot/ui/gradio/app_gradio_demo.py index a4f92fd0a..1f1cea566 100755 --- a/AvatarChatbot/ui/gradio/app_gradio_demo.py +++ b/AvatarChatbot/ui/gradio/app_gradio_demo.py @@ -1,24 +1,26 @@ -import aiohttp +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import asyncio import base64 -import gradio as gr import io -import numpy as np import os -from PIL import Image -import requests +import pdb import shutil -import soundfile as sf import subprocess import time -import pdb +import aiohttp +import gradio as gr +import numpy as np +import requests +import soundfile as sf +from PIL import Image -#%% AudioQnA functions + +# %% AudioQnA functions def preprocess_audio(audio): - """ - The audio data is a 16-bit integer array with values ranging from -32768 to 32767 and the shape of the audio data array is (samples,) - """ + """The audio data is a 16-bit integer array with values ranging from -32768 to 32767 and the shape of the audio data array is (samples,)""" sr, y = audio # Convert to normalized float32 audio y = y.astype(np.float32) @@ -26,7 +28,7 @@ def preprocess_audio(audio): # Convert the normalized float32 audio to a WAV file in memory buf = io.BytesIO() sf.write(buf, y, sr, format="WAV") - buf.seek(0) # Reset the buffer position to the beginning + buf.seek(0) # Reset the buffer position to the beginning # Encode the WAV file to base64 string base64_bytes = base64.b64encode(buf.read()) base64_string = base64_bytes.decode("utf-8") @@ -62,7 +64,7 @@ async def transcribe(audio_input): async with aiohttp.ClientSession() as session: async with session.post(ai_chatbot_url, json=initial_inputs) as response: - # response = requests.post(ai_chatbot_url, json=initial_inputs) + # response = requests.post(ai_chatbot_url, json=initial_inputs) # Check the response status code if response.status == 200: @@ -81,21 +83,23 @@ async def transcribe(audio_input): chat_ai = chat_ai[: last_punc_idx + 1] chat_history += f"AI: {chat_ai}" chat_history = chat_history.replace("OPEX", "OPEA") - return (sampling_rate, audio_int16) # handle the response + return (sampling_rate, audio_int16) # handle the response else: return {"error": "Failed to transcribe audio", "status_code": response.status_code} + def resize_image(image_pil, size=(720, 720)): - """Resize the image to the specified size""" + """Resize the image to the specified size.""" return image_pil.resize(size, Image.LANCZOS) + def resize_video(video_path, save_path, size=(720, 1280)): - """Resize the video to the specified size""" - command_resize_video = f'ffmpeg -y -i {video_path} -vf scale={size[0]}:{size[1]} {save_path}' + """Resize the video to the specified size.""" + command_resize_video = f"ffmpeg -y -i {video_path} -vf scale={size[0]}:{size[1]} {save_path}" subprocess.run(command_resize_video, shell=True) -#%% Wav2Lip functions +# %% Wav2Lip functions async def gen_video(image, audio, model_choice): """Input: image (saved .png path), ai audio (saved .wav path); Output: video""" # 0. Preprocess audio @@ -107,33 +111,33 @@ async def gen_video(image, audio, model_choice): # 1. Set environment variables match model_choice: case "wav2lip": - os.environ['INFERENCE_MODE'] = 'wav2lip_only' - os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip.pth' + os.environ["INFERENCE_MODE"] = "wav2lip_only" + os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth" case "wav2lip+GAN": - os.environ['INFERENCE_MODE'] = 'wav2lip_only' - os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip_gan.pth' + os.environ["INFERENCE_MODE"] = "wav2lip_only" + os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip_gan.pth" case "wav2lip+GFPGAN": - os.environ['INFERENCE_MODE'] = 'wav2lip+gfpgan' - os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip.pth' + os.environ["INFERENCE_MODE"] = "wav2lip+gfpgan" + os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth" # os.environ['INFERENCE_MODE'] = 'wav2lip_only' # os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip_gan.pth' - os.environ['FACE'] = image # path to either an image or a video - os.environ['AUDIO'] = output_audio_save_path # path to .wav audio + os.environ["FACE"] = image # path to either an image or a video + os.environ["AUDIO"] = output_audio_save_path # path to .wav audio # os.environ['AUDIO'] = audio - os.environ['FACESIZE'] = '96' - os.environ['OUTFILE'] = 'outputs/result6.mp4' - os.environ['GFPGAN_MODEL_VERSION'] = '1.3' - os.environ['UPSCALE_FACTOR'] = '1' # int + os.environ["FACESIZE"] = "96" + os.environ["OUTFILE"] = "outputs/result6.mp4" + os.environ["GFPGAN_MODEL_VERSION"] = "1.3" + os.environ["UPSCALE_FACTOR"] = "1" # int # os.environ['FPS'] = '25.' # can be lower (e.g., 10) - os.environ['FPS'] = '10.' # can be lower when using an image (e.g., 10) - + os.environ["FPS"] = "10." # can be lower when using an image (e.g., 10) + # 2. Run inference.sh bash script to perform Wav2Lip+GFPGAN inference # Output video is saved at the path 'OUTFILE' - command_wav2lip_gfpgan = 'bash inference_vars.sh' + command_wav2lip_gfpgan = "bash inference_vars.sh" subprocess.run(command_wav2lip_gfpgan, shell=True) - - outfile = os.environ.get('OUTFILE') + + outfile = os.environ.get("OUTFILE") if os.path.exists(outfile): res_video = outfile else: @@ -141,14 +145,14 @@ async def gen_video(image, audio, model_choice): return res_video -#%% AI Avatar demo function +# %% AI Avatar demo function # ctao 7/19 - make it asynchronous async def aiavatar_demo(audio_input): """Input: mic audio, image; Output: ai audio, text, text, ai video""" # Include AudioQnA - output_audio = await transcribe(audio_input) # AudioQnA + output_audio = await transcribe(audio_input) # AudioQnA - if isinstance(output_audio, dict): # in case of an error + if isinstance(output_audio, dict): # in case of an error return None, None else: sr, audio_int16 = output_audio @@ -156,17 +160,17 @@ async def aiavatar_demo(audio_input): sf.write(audio_file, audio_int16, sr) # return audio_file, audio_file, image return audio_file - + async def final_update(audio, image, model_choice): res_video = await gen_video(image, audio, model_choice) return res_video -#%% Main +# %% Main if __name__ == "__main__": # HOST_IP = os.getenv("host_ip") - HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode('utf-8').strip() + HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode("utf-8").strip() # Fetch the AudioQnA backend server ai_chatbot_url = f"http://{HOST_IP}:3008/v1/audioqna" @@ -177,52 +181,61 @@ async def final_update(audio, image, model_choice): # Prepare 3 image paths # HOME = os.getenv("HOME") # HOME="/mnt/localdisk4" - HOME="/home/demo/" - image_pils = [Image.open(os.path.join("../assets/img/woman1.png")), - Image.open(os.path.join("../assets/img/man1.png")), - Image.open(os.path.join("../assets/img/woman2.png"))] - - video_paths = [os.path.join("../assets/video/man1.mp4"), - os.path.join("../assets/video/woman2.mp4"), - os.path.join("../assets/video/man4.mp4")] + HOME = "/home/demo/" + image_pils = [ + Image.open(os.path.join("../assets/img/woman1.png")), + Image.open(os.path.join("../assets/img/man1.png")), + Image.open(os.path.join("../assets/img/woman2.png")), + ] + + video_paths = [ + os.path.join("../assets/video/man1.mp4"), + os.path.join("../assets/video/woman2.mp4"), + os.path.join("../assets/video/man4.mp4"), + ] def image_to_base64(image_path): with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') + return base64.b64encode(image_file.read()).decode("utf-8") # Convert your images to Base64 # opea_qr_base64 = image_to_base64('../rfcs/opea_qr.png') # opea_gh_qr_base64 = image_to_base64('../rfcs/opea_gh_qr.png') - xeon_base64 = image_to_base64('../rfcs/xeon.jpg') - gaudi_base64 = image_to_base64('../rfcs/gaudi.png') + xeon_base64 = image_to_base64("../rfcs/xeon.jpg") + gaudi_base64 = image_to_base64("../rfcs/gaudi.png") # List of prerecorded WAV files containing audio questions - audio_filepaths = ["../assets/audio/intel1.wav", - "../assets/audio/intel2.wav", - "../assets/audio/intel3.wav", - "../assets/audio/intel4.wav", - "../assets/audio/pnp1.wav", - "../assets/audio/pnp2.wav", - "../assets/audio/pnp3.wav", - "../assets/audio/pnp4.wav", - "../assets/audio/entertainment1.wav", - "../assets/audio/entertainment2.wav"] - audio_questions = ["1. What are the latest data center processor and AI accelerator products at Intel? Name them.", - "2. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?", - "3. What is Intel's Gaudi 3 AI Accelerator performance compared to Nvidia H100?", - "4. What kinds of Intel AI tools are available to accelerate AI workloads?", - "5. What is Plug and Play Technology Center? Where is it located?", - "6. Tell us about inflation in the US in the past few years?", - "7. What is the difference between an index fund and a mutual fund?", - "8. What is the difference between pretax and roth retirement accounts?", - "9. Which team won the Superbowl in 2022?", - "10. In the Lord of the Rings, who threw the Ring into Mount Doom?"] + audio_filepaths = [ + "../assets/audio/intel1.wav", + "../assets/audio/intel2.wav", + "../assets/audio/intel3.wav", + "../assets/audio/intel4.wav", + "../assets/audio/pnp1.wav", + "../assets/audio/pnp2.wav", + "../assets/audio/pnp3.wav", + "../assets/audio/pnp4.wav", + "../assets/audio/entertainment1.wav", + "../assets/audio/entertainment2.wav", + ] + audio_questions = [ + "1. What are the latest data center processor and AI accelerator products at Intel? Name them.", + "2. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?", + "3. What is Intel's Gaudi 3 AI Accelerator performance compared to Nvidia H100?", + "4. What kinds of Intel AI tools are available to accelerate AI workloads?", + "5. What is Plug and Play Technology Center? Where is it located?", + "6. Tell us about inflation in the US in the past few years?", + "7. What is the difference between an index fund and a mutual fund?", + "8. What is the difference between pretax and roth retirement accounts?", + "9. Which team won the Superbowl in 2022?", + "10. In the Lord of the Rings, who threw the Ring into Mount Doom?", + ] # Demo frontend demo = gr.Blocks() with demo: # Define processing functions count = 0 + def initial_process(audio_input): global count, chat_history start_time = time.time() @@ -249,7 +262,7 @@ def update_selected_image_state(image_index): return f"inputs/face_{image_index}.png" else: return f"inputs/video_{image_index - len(image_pils)}.mp4" - + def update_audio_input(audio_choice): if audio_choice: audio_index = int(audio_choice.split(".")[0]) - 1 @@ -257,17 +270,19 @@ def update_audio_input(audio_choice): shutil.copyfile(audio_filepaths[audio_index], audio_filepath_gradio) # audio_input.value = audio_filepath_gradio return audio_filepath_gradio - + # UI Components # Title & Introduction gr.Markdown("

A PyTorch and OPEA based AI Avatar Audio Chatbot

") # gr.Markdown("# **Using OPEA to implement a RAG-Powered Human-Like AI Avatar Audio Chatbot**") with gr.Row(): with gr.Column(scale=8): - gr.Markdown(""" + gr.Markdown( + """

Welcome to our AI Avatar Audio Chatbot! This application leverages PyTorch and OPEA (Open Platform for Enterprise AI) v0.8 to provide you with a human-like conversational experience. It's run on Intel® Gaudi® AI Accelerator and Intel® Xeon® Processor, with hardware and software optimizations.
Please feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

- """) + """ + ) with gr.Column(scale=1): # with gr.Row(): # gr.Markdown(f""" @@ -277,15 +292,20 @@ def update_audio_input(audio_choice): # OPEA GitHub QR Code # """, label="OPEA GitHub QR Code") with gr.Row(): - gr.Markdown(f""" - Intel®Gaudi""", label="Intel®Gaudi") - gr.Markdown(f""" - Intel®Xeon""", label="Intel®Xeon") + gr.Markdown( + f""" + Intel®Gaudi""", + label="Intel®Gaudi", + ) + gr.Markdown( + f""" + Intel®Xeon""", + label="Intel®Xeon", + ) gr.Markdown("
") # Divider - # Inputs - # Image gallary + # Image gallery selected_image_state = gr.State(value=-1) image_clicks = [] image_click_buttons = [] @@ -293,23 +313,27 @@ def update_audio_input(audio_choice): video_click_buttons = [] with gr.Row(): with gr.Column(scale=1): - audio_input = gr.Audio(sources=None, format="wav", label="🎤 or 📤 for your Intput audio!") + audio_input = gr.Audio(sources=None, format="wav", label="🎤 or 📤 for your Input audio!") audio_choice = gr.Dropdown( choices=audio_questions, label="Choose an audio question", - value=None, # default value + value=None, # default value ) # Update audio_input when a selection is made from the dropdown audio_choice.change(fn=update_audio_input, inputs=audio_choice, outputs=audio_input) - face_input = gr.File(file_count="single", file_types=["image", "video"], label="Choose an avatar or 📤 an image or video!") + face_input = gr.File( + file_count="single", + file_types=["image", "video"], + label="Choose an avatar or 📤 an image or video!", + ) model_choice = gr.Dropdown( choices=["wav2lip", "wav2lip+GAN", "wav2lip+GFPGAN"], label="Choose a DL model", ) with gr.Column(scale=2): # Display 3 images and buttons - with gr.Row(): + with gr.Row(): for i, image_pil in enumerate(image_pils): image_pil = resize_image(image_pil) save_path = f"inputs/face_{i}.png" @@ -328,41 +352,44 @@ def update_audio_input(audio_choice): with gr.Row(): for i in range(len(video_paths)): video_click_buttons.append(gr.Button(f"Use Video {i+1}")) - + submit_button = gr.Button("Submit") - + # Outputs gr.Markdown("
") # Divider with gr.Row(): with gr.Column(scale=1): audio_output_interm = gr.Audio(label="🔊 Output audio", autoplay=True) chat_history_box = gr.Textbox(label="Chat History", value=chat_history) - audio_time_text = gr.Textbox(label=f"Audio processing time", value="0.0 seconds") + audio_time_text = gr.Textbox(label="Audio processing time", value="0.0 seconds") with gr.Column(scale=2): video_output = gr.Video(label="Your AI Avatar video: ", format="mp4", width=1280, height=720) - video_time_text = gr.Textbox(label=f"Video processing time", value="0.0 seconds") + video_time_text = gr.Textbox(label="Video processing time", value="0.0 seconds") # Technical details gr.Markdown("
") # Divider with gr.Row(): - gr.Markdown(""" + gr.Markdown( + """

OPEA megaservice deployed:

-

OPEA microservices deployed: +

OPEA microservices deployed:

- """) - #

OPEA's "AvatarChatbot" megaservice is composed of "ASR->LLM->TTS->Animation" microservices. It first generates an expert answer based on your query, and then animates the avatar figure with output audio. Feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

+ """ + ) + #

OPEA's "AvatarChatbot" megaservice is composed of "ASR->LLM->TTS->Animation" microservices. It first generates an expert answer based on your query, and then animates the avatar figure with output audio. Feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

with gr.Row(): gr.Image("./flowchart_1.png", label="Megaservice Flowchart") with gr.Row(): - gr.Markdown(""" + gr.Markdown( + """

The AI Avatar Audio Chatbot is powered by the following Intel® AI software:

- """) - + """ + ) + # Disclaimer gr.Markdown("
") # Divider gr.Markdown("

Notices & Disclaimers

") - gr.Markdown(""" + gr.Markdown( + """

Intel is committed to respecting human rights and avoiding complicity in human rights abuses. See Intel's Global Human Rights Principles. Intel's products and software are intended only to be used in applications that do not cause or contribute to a violation of an internationally recognized human right.

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

You may not use or facilitate the use of this document in connection with any infringement or other legal analysis concerning Intel products described herein. You agree to grant Intel a non-exclusive, royalty-free license to any patent claim thereafter drafted which includes subject matter disclosed herein.

- """) - + """ + ) + # States interm_state = gr.State(value="initial") - + # State transitions for i in range(len(image_pils)): - image_click_buttons[i].click(update_selected_image_state, - inputs=[gr.Number(value=i, visible=False)], - outputs=[face_input]) + image_click_buttons[i].click( + update_selected_image_state, inputs=[gr.Number(value=i, visible=False)], outputs=[face_input] + ) for i in range(len(video_paths)): - video_click_buttons[i].click(update_selected_image_state, - inputs=[gr.Number(value=i+len(image_pils), visible=False)], - outputs=[face_input]) + video_click_buttons[i].click( + update_selected_image_state, + inputs=[gr.Number(value=i + len(image_pils), visible=False)], + outputs=[face_input], + ) # submit_button = gr.Button("Submit") submit_button.click( initial_process, inputs=[audio_input], - outputs=[audio_output_interm, - interm_state, - audio_time_text, - chat_history_box], # need to change interm_state + outputs=[ + audio_output_interm, + interm_state, + audio_time_text, + chat_history_box, + ], # need to change interm_state ) interm_state.change( final_process, inputs=[audio_output_interm, face_input, model_choice], - outputs=[video_output, - video_time_text], + outputs=[video_output, video_time_text], ) demo.queue().launch(server_name="0.0.0.0", server_port=7861)