Support for UI of MultimodalRAGWithVideos in GenAIExamples (opea-project#656)

* use PNG instead of JPG due to tgi-gaudi; return message includes video_id maps

Signed-off-by: Tiep Le <[email protected]>

* update lvm and lvm_tgi to handle empty retrieval results

Signed-off-by: Tiep Le <[email protected]>

* handle no retrieval results

Signed-off-by: Tiep Le <[email protected]>

* add metadata to TextDoc returned by LVM

Signed-off-by: Tiep Le <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add MetadataTextDoc

Signed-off-by: Tiep Le <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test

Signed-off-by: Tiep Le <[email protected]>

* add metadata to ChatCompletionResponseChoice

Signed-off-by: Tiep Le <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Tiep Le <[email protected]>

* fix bug

Signed-off-by: Tiep Le <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Tiep Le <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix for conciseness

Signed-off-by: Tiep Le <[email protected]>

* update port for test

Signed-off-by: Tiep Le <[email protected]>

* update test

Signed-off-by: Tiep Le <[email protected]>

---------

Signed-off-by: Tiep Le <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: kevinintel <[email protected]>
Co-authored-by: Sihan Chen <[email protected]>
Co-authored-by: chen, suyue <[email protected]>
5 people authored Sep 11, 2024
1 parent 8e3f553 commit 7664578
Showing 10 changed files with 95 additions and 22 deletions.
1 change: 1 addition & 0 deletions comps/__init__.py
@@ -16,6 +16,7 @@
     LVMSearchedMultimodalDoc,
     RerankedDoc,
     TextDoc,
+    MetadataTextDoc,
     RAGASParams,
     RAGASScores,
     GraphDoc,
18 changes: 17 additions & 1 deletion comps/cores/mega/gateway.py
@@ -777,14 +777,30 @@ async def handle_request(self, request: Request):
         ):
             return response
         last_node = runtime_graph.all_leaves()[-1]
-        response = result_dict[last_node]["text"]
+
+        if "text" in result_dict[last_node].keys():
+            response = result_dict[last_node]["text"]
+        else:
+            # "text" is not in the response message;
+            # something went wrong, for example due to empty retrieval results
+            if "detail" in result_dict[last_node].keys():
+                response = result_dict[last_node]["detail"]
+            else:
+                response = "The server failed to generate an answer to your query!"
+        if "metadata" in result_dict[last_node].keys():
+            # from retrieval results
+            metadata = result_dict[last_node]["metadata"]
+        else:
+            # follow-up question, no retrieval
+            metadata = None
         choices = []
         usage = UsageInfo()
         choices.append(
             ChatCompletionResponseChoice(
                 index=0,
                 message=ChatMessage(role="assistant", content=response),
                 finish_reason="stop",
+                metadata=metadata,
             )
         )
         return ChatCompletionResponse(model="multimodalragwithvideos", choices=choices, usage=usage)
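
For illustration, a successful non-streaming reply from this gateway now carries the retrieval metadata alongside the answer. A minimal sketch of the serialized response, with a hypothetical answer and the metadata values borrowed from the test fixture further down:

{
    "model": "multimodalragwithvideos",
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "The video shows a yellow scene."},
            "finish_reason": "stop",
            "metadata": {
                "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0",
                "source_video": "WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4",
                "time_of_frame_ms": "37000000"
            }
        }
    ],
    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
}
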
1 change: 1 addition & 0 deletions comps/cores/proto/api_protocol.py
@@ -299,6 +299,7 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
+    metadata: Optional[Dict[str, Any]] = None


 class ChatCompletionResponse(BaseModel):
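
A quick illustrative check (not part of the diff) that the extended model accepts the new optional field; the import path follows the file shown above:

from comps.cores.proto.api_protocol import ChatCompletionResponseChoice, ChatMessage

choice = ChatCompletionResponseChoice(
    index=0,
    message=ChatMessage(role="assistant", content="A yellow image."),
    finish_reason="stop",
    metadata={"video_id": "8c7461df-b373-4a00-8696-9a2234359fe0"},  # optional; defaults to None
)
print(choice.metadata)
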
7 changes: 7 additions & 0 deletions comps/cores/proto/docarray.py
@@ -20,6 +20,13 @@ class TextDoc(BaseDoc, TopologyInfo):
     text: str = None


+class MetadataTextDoc(TextDoc):
+    metadata: Optional[Dict[str, Any]] = Field(
+        description="This encloses all metadata associated with the textdoc.",
+        default=None,
+    )
+
+
 class ImageDoc(BaseDoc):
     url: Optional[ImageUrl] = Field(
         description="The path to the image. It can be remote (Web) URL, or a local file path",
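
As a usage sketch for the new document type (relying on the export added to comps/__init__.py above; metadata values taken from the test fixture below):

from comps import MetadataTextDoc

doc = MetadataTextDoc(
    text="A short answer about the video.",
    metadata={
        "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0",
        "source_video": "WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4",
        "time_of_frame_ms": "37000000",
    },
)
print(doc.text, doc.metadata["video_id"])
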
10 changes: 5 additions & 5 deletions comps/dataprep/multimodal/redis/langchain/multimodal_utils.py
@@ -123,13 +123,13 @@ def str2time(strtime: str):

 def convert_img_to_base64(image):
     "Convert image to base64 string"
-    _, buffer = cv2.imencode(".jpg", image)
+    _, buffer = cv2.imencode(".png", image)
     encoded_string = base64.b64encode(buffer)
     return encoded_string.decode()


 def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: str, vtt_path: str, output_dir: str):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -157,7 +157,7 @@ def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: str, vtt_path: str, output_dir: str):
         if success:
             # Save frame for further processing
             img_fname = f"frame_{idx}"
-            img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+            img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
             cv2.imwrite(img_fpath, frame)

             # Convert image to base64 encoded string
@@ -195,7 +195,7 @@ def use_lvm(endpoint: str, img_b64_string: str, prompt: str = "Provide a short d
 def extract_frames_and_generate_captions(
     video_id: str, video_path: str, lvm_endpoint: str, output_dir: str, key_frame_per_second: int = 1
 ):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice."""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) by generating captions using the LVM microservice."""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -225,7 +225,7 @@ def extract_frames_and_generate_captions(

         # Save frame for further processing
         img_fname = f"frame_{idx}"
-        img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+        img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
         cv2.imwrite(img_fpath, frame)

         # Convert image to base64 encoded string
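
The switch from JPG to PNG matters because PNG is lossless, which suits llava-tgi-gaudi's image input. A self-contained sketch of the encode/decode round trip (uses only opencv-python and its numpy dependency; the frame is synthetic):

import base64

import cv2
import numpy as np

# Encode a frame losslessly as PNG, then base64 for transport.
frame = np.zeros((10, 10, 3), dtype=np.uint8)
ok, buffer = cv2.imencode(".png", frame)
assert ok
b64_img_str = base64.b64encode(buffer).decode()

# Decode back; PNG gives a bit-exact round trip, unlike lossy JPG.
raw = np.frombuffer(base64.b64decode(b64_img_str), dtype=np.uint8)
decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR)
assert (decoded == frame).all()
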
27 changes: 23 additions & 4 deletions comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
@@ -199,7 +199,7 @@ def prepare_data_and_metadata_from_annotation(
     metadatas = []
     for i, frame in enumerate(annotation):
         frame_index = frame["sub_video_id"]
-        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.jpg")
+        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.png")
         # augmenting this frame's transcript with a reasonable number of neighboring frames' transcripts helps semantic retrieval
         lb_ingesting = max(0, i - num_transcript_concat_for_ingesting)
         ub_ingesting = min(len(annotation), i + num_transcript_concat_for_ingesting + 1)
@@ -275,6 +275,7 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None)):

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -299,6 +300,8 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None)):
            with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                shutil.copyfileobj(video_file.file, f)

+            uploaded_videos_saved_videos_map[video_name] = video_file_name
+
             # Extract temporary audio wav file from video mp4
             audio_file = video_dir_name + ".wav"
             print(f"Extracting {audio_file}")
@@ -345,7 +348,11 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None)):
         end = time.time()
         print(str(end - st))

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -358,6 +365,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -380,6 +388,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Store frames and caption annotations in a new directory
             extract_frames_and_generate_captions(
@@ -397,7 +406,11 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -413,6 +426,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
     if files:
         video_files, video_file_names = [], []
         captions_files, captions_file_names = [], []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -451,6 +465,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Save captions file in upload directory
             vtt_file_name = os.path.splitext(video_file.filename)[0] + ".vtt"
@@ -482,7 +497,11 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(
         status_code=400, detail="Must provide at least one pair consisting of video (.mp4) and captions (.vtt)"
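
For illustration, a client can now recover the mapping from uploaded video names to the saved (ID-suffixed) file names; a sketch, with a hypothetical host, port, and route:

import requests

resp = requests.post(
    "http://localhost:6007/v1/generate_transcripts",  # hypothetical dataprep endpoint
    files=[("files", ("video1.mp4", open("video1.mp4", "rb"), "video/mp4"))],
)
resp.raise_for_status()
print(resp.json()["video_id_maps"])  # e.g. {"video1": "video1_<uuid>.mp4"}
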
1 change: 0 additions & 1 deletion comps/dataprep/multimodal/redis/langchain/requirements.txt
@@ -1,7 +1,6 @@
 docarray[full]
 fastapi
 langchain==0.1.12
-langchain_benchmarks
 moviepy
 openai-whisper
 opencv-python
20 changes: 18 additions & 2 deletions comps/lvms/llava/lvm.py
@@ -8,13 +8,15 @@
 from typing import Union

 import requests
+from fastapi import HTTPException
 from langchain_core.prompts import PromptTemplate
 from template import ChatTemplate

 from comps import (
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -35,14 +37,20 @@
     port=9399,
 )
 @register_statistics(names=["opea_service@lvm"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
     if isinstance(request, LVMSearchedMultimodalDoc):
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved.
+            # Raise an HTTPException with status_code 500 because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the query!")
+
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -75,7 +83,15 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
     result = response.json()["text"]
     if logflag:
         logger.info(result)
-    return TextDoc(text=result)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata will be used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=result, metadata=return_metadata)
+    else:
+        return TextDoc(text=result)


 if __name__ == "__main__":
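
To see both outcomes of the updated lvm service, a request sketch against its registered port 9399 (payload mirrors the test below; the tiny base64 string is the test's 10x10 PNG):

import requests

payload = {
    "retrieved_docs": [],
    "initial_query": "What is this?",
    "top_n": 1,
    "metadata": [
        {
            "b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC",
            "transcript_for_inference": "yellow image",
            "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0",
            "time_of_frame_ms": "37000000",
            "source_video": "WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4",
        }
    ],
}
resp = requests.post("http://localhost:9399/v1/lvm", json=payload)
resp.raise_for_status()  # raises on the 500 emitted when "metadata" is empty
print(resp.json())  # MetadataTextDoc fields: text plus video_id/source_video/time_of_frame_ms
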
19 changes: 17 additions & 2 deletions comps/lvms/tgi-llava/lvm_tgi.py
@@ -5,6 +5,7 @@
 import time
 from typing import Union

+from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
 from huggingface_hub import AsyncInferenceClient
 from langchain_core.prompts import PromptTemplate
@@ -14,6 +15,7 @@
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -36,7 +38,7 @@
     output_datatype=TextDoc,
 )
 @register_statistics(names=["opea_service@lvm_tgi"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
@@ -46,6 +48,11 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if not retrieved_metadatas or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved.
+            # Raise an HTTPException with status_code 500 because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the query!")
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -121,7 +128,15 @@ async def stream_generator():
     statistics_dict["opea_service@lvm_tgi"].append_latency(time.time() - start, None)
     if logflag:
         logger.info(generated_str)
-    return TextDoc(text=generated_str)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata will be used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=generated_str, metadata=return_metadata)
+    else:
+        return TextDoc(text=generated_str)


 if __name__ == "__main__":
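
Since lvm_tgi also supports streaming (see stream_generator above), a client-side sketch for the streamed variant; the streaming flag on LVMDoc and the port (mirroring the llava lvm service above) are assumptions here:

import requests

with requests.post(
    "http://localhost:9399/v1/lvm",
    json={"image": "<base64 PNG>", "prompt": "What is this?", "streaming": True},  # streaming flag assumed
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode())  # StreamingResponse chunks, typically "data: ..." lines
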
13 changes: 6 additions & 7 deletions tests/lvms/test_lvms_llava.sh
@@ -29,16 +29,15 @@ function build_docker_images() {

 function start_service() {
     unset http_proxy
-    llava_port=5071
-    lvm_port=5072
-    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:8399 --ipc=host opea/llava:comps
-    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:$llava_port -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
+    lvm_port=5051
+    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5028:8399 --ipc=host opea/llava:comps
+    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:5028 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
     sleep 8m
 }

 function validate_microservice() {

-    lvm_port=5072
+    lvm_port=5051
     result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
@@ -49,7 +48,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}]}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}]}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
@@ -59,7 +58,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
