From b469976c37e83027e771ed04e6a277a5c283dfd9 Mon Sep 17 00:00:00 2001
From: srinarayan-srikanthan
Date: Mon, 2 Sep 2024 20:09:27 -0700
Subject: [PATCH 01/29] dataprep service

Signed-off-by: srinarayan-srikanthan
---
 comps/dataprep/vdms/README.md | 189 ++++++++++++++++++
 comps/dataprep/vdms/langchain/__init__.py | 2 +
 comps/dataprep/vdms/langchain/config.py | 33 +++
 .../dataprep/vdms/langchain/docker/Dockerfile | 35 ++++
 .../docker/docker-compose-dataprep-vdms.yaml | 28 +++
 .../vdms/langchain/prepare_doc_vdms.py | 81 ++++++++
 .../dataprep/vdms/langchain/requirements.txt | 39 ++++
 .../vdms/multimodal_langchain/__init__.py | 2 +
 .../vdms/multimodal_langchain/config.yaml | 30 +++
 .../multimodal_langchain/docker/Dockerfile | 40 ++++
 .../docker/docker-compose-dataprep-vdms.yaml | 28 +++
 .../multimodal_langchain/ingest_videos.py | 112 +++++++++++
 .../multimodal_langchain/requirements.txt | 39 ++++
 .../utils/store_embeddings.py | 121 +++++++++++
 .../vdms/multimodal_langchain/utils/utils.py | 119 +++++++++++
 .../vdms/multimodal_langchain/utils/vclip.py | 58 ++++++
 16 files changed, 956 insertions(+)
 create mode 100644 comps/dataprep/vdms/README.md
 create mode 100644 comps/dataprep/vdms/langchain/__init__.py
 create mode 100644 comps/dataprep/vdms/langchain/config.py
 create mode 100644 comps/dataprep/vdms/langchain/docker/Dockerfile
 create mode 100644 comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml
 create mode 100644 comps/dataprep/vdms/langchain/prepare_doc_vdms.py
 create mode 100644 comps/dataprep/vdms/langchain/requirements.txt
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/__init__.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/config.yaml
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/requirements.txt
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/utils.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/vclip.py

diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md
new file mode 100644
index 000000000..617761f02
--- /dev/null
+++ b/comps/dataprep/vdms/README.md
@@ -0,0 +1,189 @@
+# Dataprep Microservice with VDMS
+
+For the dataprep microservice, we currently provide one framework: `Langchain`.
+
+The `langchain` and `multimodal_langchain` folders are organized the same way, so you can follow the same steps below for either one when building the dataprep microservice.
+
+# 🚀1. Start Microservice with Python (Option 1)
+
+## 1.1 Install Requirements
+
+- Option 1: Install the single-process version (for processing 1-10 files)
+
+```bash
+apt-get update
+apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils
+cd langchain
+pip install -r requirements.txt
+```
+
+## 1.2 Start VDMS Server
+
+Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).
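+
+If you just need a local server and don't want to follow the full readme, a minimal sketch consistent with the compose file added later in this patch (image `intellabs/vdms:latest`, default VDMS port 55555) is:
+
+```bash
+docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest
+```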
+
+## 1.3 Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export VDMS_HOST=${host_ip}
+export VDMS_PORT=55555
+export COLLECTION_NAME=${your_collection_name}
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
+export PYTHONPATH=${path_to_comps}
+```
+
+## 1.4 Start Document Preparation Microservice for VDMS with Python Script
+
+Start the document preparation microservice for VDMS with the command below.
+
+- Option 1: Start the single-process version (for processing 1-10 files)
+
+```bash
+python prepare_doc_vdms.py
+```
+
+# 🚀2. Start Microservice with Docker (Option 2)
+
+## 2.1 Start VDMS Server
+
+Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).
+
+## 2.2 Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export VDMS_HOST=${host_ip}
+export VDMS_PORT=55555
+export TEI_ENDPOINT=${your_tei_endpoint}
+export COLLECTION_NAME=${your_collection_name}
+export SEARCH_ENGINE="FaissFlat"
+export DISTANCE_STRATEGY="L2"
+export PYTHONPATH=${path_to_comps}
+```
+
+## 2.3 Build Docker Image
+
+- Build the docker image with langchain
+
+- Option 1: Build the single-process version (for processing 1-10 files)
+
+```bash
+cd ../../../
+docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile .
+```
+
+## 2.4 Run Docker with CLI
+
+- Option 1: Start the single-process version (for processing 1-10 files)
+
+```bash
+docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \
+-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \
+-e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \
+opea/dataprep-vdms:latest
+```
+
+# 🚀3. Status Microservice
+
+```bash
+docker container logs -f dataprep-vdms-server
+```
+
+# 🚀4. Consume Microservice
+
+Once the document preparation microservice for VDMS is started, you can use the command below to invoke the microservice, which converts a document into embeddings and saves them to the database.
+
+Make sure the file path after `files=@` is correct.
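+
+As a sketch, assuming the multipart `files` upload interface that the `files=@` note above implies (the file name here is a placeholder), a typical call looks like:
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./your_file.pdf" \
+  http://localhost:6007/v1/dataprep
+```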
+ + diff --git a/comps/dataprep/vdms/langchain/__init__.py b/comps/dataprep/vdms/langchain/__init__.py new file mode 100644 index 000000000..4582b4f9a --- /dev/null +++ b/comps/dataprep/vdms/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/config.py b/comps/dataprep/vdms/langchain/config.py new file mode 100644 index 000000000..3e3e06a16 --- /dev/null +++ b/comps/dataprep/vdms/langchain/config.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + + +def getEnv(key, default_value=None): + env_value = os.getenv(key, default=default_value) + print(f"{key}: {env_value}") + return env_value + + +# Embedding model +EMBED_MODEL = getEnv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# VDMS configuration +VDMS_HOST = getEnv("VDMS_HOST", "localhost") +VDMS_PORT = int(getEnv("VDMS_PORT", 55555)) +COLLECTION_NAME = getEnv("COLLECTION_NAME", "rag-vdms") +SEARCH_ENGINE = getEnv("SEARCH_ENGINE", "FaissFlat") +DISTANCE_STRATEGY = getEnv("DISTANCE_STRATEGY", "L2") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = getEnv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = getEnv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = getEnv("TEI_ENDPOINT") + +# chunk parameters +CHUNK_SIZE = getEnv("CHUNK_SIZE", 1500) +CHUNK_OVERLAP = getEnv("CHUNK_OVERLAP", 100) + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/docker/Dockerfile b/comps/dataprep/vdms/langchain/docker/Dockerfile new file mode 100644 index 000000000..606b0a4e1 --- /dev/null +++ b/comps/dataprep/vdms/langchain/docker/Dockerfile @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libcairo2-dev \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/langchain/requirements.txt + +ENV PYTHONPATH=/home/user + +USER user + +WORKDIR /home/user/comps/dataprep/vdms/langchain + +ENTRYPOINT ["python", "prepare_doc_vdms.py"] \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml new file mode 100644 index 000000000..edb733c7d --- /dev/null +++ b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + vdms-vector-db: + image: intellabs/vdms:latest + container_name: vdms-vector-db + ports: + - "55555:55555" + dataprep-vdms: + image: opea/dataprep-vdms:latest + container_name: dataprep-vdms-server + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + VDMS_HOST: ${VDMS_HOST} + 
VDMS_PORT: ${VDMS_PORT} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py new file mode 100644 index 000000000..13591dd26 --- /dev/null +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -0,0 +1,81 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.vdms import VDMS, VDMS_Client +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import DocPath, opea_microservices, opea_telemetry, register_microservice +from comps.dataprep.utils import document_loader, get_separators, get_tables_result + +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) + + +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +@opea_telemetry +def ingest_documents(doc_path: DocPath): + """Ingest document to VDMS.""" + path = doc_path.path + print(f"Parsing document {doc_path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + ) + + content = document_loader(doc_path) + chunks = text_splitter.split_text(content) + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + + print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") + + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + for i in range(0, num_chunks, batch_size): + batch_chunks = chunks[i : i + batch_size] + batch_texts = batch_chunks + + _ = VDMS.from_texts( + client=client, + embedding=embedder, + collection_name=COLLECTION_NAME, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + texts=batch_texts, + ) + print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt new file mode 100644 index 000000000..859dec9f9 --- /dev/null +++ b/comps/dataprep/vdms/langchain/requirements.txt @@ -0,0 +1,39 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-core +langchain-text-splitters +langsmith +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +python-bidi==0.4.2 +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.11.5 +uvicorn +vdms +tqdm +tzlocal +opencv-python +tqdm +tzlocal +PyYAML +typing +decord +einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/__init__.py b/comps/dataprep/vdms/multimodal_langchain/__init__.py new file mode 100644 index 000000000..4582b4f9a --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml new file mode 100644 index 000000000..b164d263a --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -0,0 +1,30 @@ +# Path to all videos +videos: video_ingest/videos/ +# Do you want to extract frames of videos (True if not done already, else False) +generate_frames: True +# How do you want to generate feature embeddings? +embeddings: + type: 'video' + vclip_model_name: "openai/clip-vit-base-patch32" + vclip_num_frm: 64 + path: 'video_ingest/embeddings' +# VL-branch config +vl_branch: + cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml + model_type: 'llama_v2' +# Path to store metadata files +meta_output_dir: video_ingest/video_metadata/ +# Chunk duration defines the interval of time that each embedding will occur +chunk_duration: 30 +# Clip duration defines the length of the interval in which the embeding will occur +clip_duration: 10 +# e.g. 
For every <chunk_duration>, you embed the first <clip_duration>'s frames of that interval
+
+vector_db:
+  choice_of_db: 'vdms' # #Supported databases [vdms]
+  host: 0.0.0.0
+  port: 55555
+
+
+# LLM path
+model_path: meta-llama/Llama-2-7b-chat-hf
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
new file mode 100644
index 000000000..505448a77
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
@@ -0,0 +1,40 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+ARG ARCH="cpu"
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libcairo2-dev \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/multimodal_langchain/requirements.txt
+
+ENV PYTHONPATH=/home/user
+
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files
+
+USER user
+
+WORKDIR /home/user/comps/dataprep/vdms/multimodal_langchain
+
+ENTRYPOINT ["python", "ingest_videos.py"]
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
new file mode 100644
index 000000000..edb733c7d
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
@@ -0,0 +1,28 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3"
+services:
+  vdms-vector-db:
+    image: intellabs/vdms:latest
+    container_name: vdms-vector-db
+    ports:
+      - "55555:55555"
+  dataprep-vdms:
+    image: opea/dataprep-vdms:latest
+    container_name: dataprep-vdms-server
+    ports:
+      - "6007:6007"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      VDMS_HOST: ${VDMS_HOST}
+      VDMS_PORT: ${VDMS_PORT}
+      COLLECTION_NAME: ${COLLECTION_NAME}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
new file mode 100644
index 000000000..72532387f
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
@@ -0,0 +1,112 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import json
+from tqdm import tqdm
+from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
+from utils.utils import read_config, process_all_videos
+from utils import store_embeddings
+from utils.vclip import vCLIP
+
+
+VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0")
+
+def setup_vclip_model(config, device="cpu"):
+    model = vCLIP(config)
+    return model
+
+def read_json(path):
+    with open(path) as f:
+        x = json.load(f)
+    return x
+
+def store_into_vectordb(vs, metadata_file_path, embedding_model, config):
+    GMetadata = 
read_json(metadata_file_path) + global_counter = 0 + + total_videos = len(GMetadata.keys()) + + for idx, (video, data) in enumerate(tqdm(GMetadata.items())): + image_name_list = [] + embedding_list = [] + metadata_list = [] + ids = [] + + if config['embeddings']['type'] == 'video': + data['video'] = video + video_name_list = [data["video_path"]] + metadata_list = [data] + if vs.selected_db == 'vdms': + vs.video_db.add_videos( + paths=video_name_list, + metadatas=metadata_list, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']] + ) + else: + print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") + + # clean up tmp_ folders containing frames (jpeg) + for i in os.listdir(): + if i.startswith("tmp_"): + print("removing tmp_*") + os.system(f"rm -r tmp_*") + print("done.") + break + +def generate_embeddings(config, embedding_model, vs): + print('inside generate') + process_all_videos(config) + global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') + print(f'global metadata file available at {global_metadata_file_path}') + store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) + +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +@opea_telemetry +def process_videos(doc_path: DocPath): + """Ingest videos to VDMS.""" + path = doc_path.path + print(f"Parsing videos {path}.") + + ################# + #set config_file + ################# + + config= config = read_config('./config.yaml') + meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} + generate_frames = config['generate_frames'] + path = config['videos'] + meta_output_dir = config['meta_output_dir'] + emb_path = config['embeddings']['path'] + host = VECTORDB_SERVICE_HOST_IP + port = int(config['vector_db']['port']) + selected_db = config['vector_db']['choice_of_db'] + + # Creating DB + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') + print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + #check embedding type + if 'video' == 'video': + # init meanclip model + model = setup_vclip_model(meanclip_cfg, device="cpu") + print('init model') + vs = store_embeddings.VideoVS(host, port, selected_db, model) + print('init vector store') + else: + print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") + return + generate_embeddings(config, model, vs) + print('done............success..............') + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/requirements.txt b/comps/dataprep/vdms/multimodal_langchain/requirements.txt new file mode 100644 index 000000000..859dec9f9 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/requirements.txt @@ -0,0 +1,39 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-core +langchain-text-splitters +langsmith +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +python-bidi==0.4.2 +python-docx +python-pptx +sentence_transformers 
+shortuuid +unstructured[all-docs]==0.11.5 +uvicorn +vdms +tqdm +tzlocal +opencv-python +tqdm +tzlocal +PyYAML +typing +decord +einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py new file mode 100644 index 000000000..8c77c9714 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -0,0 +1,121 @@ +from langchain_community.vectorstores import VDMS +from langchain_community.vectorstores.vdms import VDMS_Client +from langchain.pydantic_v1 import BaseModel, root_validator +from langchain_core.embeddings import Embeddings +from decord import VideoReader, cpu +import numpy as np +from typing import List, Optional, Iterable, Dict, Any +from PIL import Image +import torch +import os +import time +import torchvision.transforms as T +toPIL = T.ToPILImage() + +# 'similarity', 'similarity_score_threshold' (needs threshold), 'mmr' + +class vCLIPEmbeddings(BaseModel, Embeddings): + """MeanCLIP Embeddings model.""" + + model: Any + + @root_validator(allow_reuse=True) + def validate_environment(cls, values: Dict) -> Dict: + """Validate that open_clip and torch libraries are installed.""" + try: + # Use the provided model if present + if "model" not in values: + raise ValueError("Model must be provided during initialization.") + + except ImportError: + raise ImportError( + "Please ensure CLIP model is loaded" + ) + return values + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + model_device = next(self.model.clip.parameters()).device + text_features = self.model.get_text_embeddings(texts) + + return text_features.detach().numpy() + + + def embed_query(self, text: str) -> List[float]: + return self.embed_documents([text])[0] + + + def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: + # Open images directly as PIL images + + video_features = [] + for vid_path in sorted(paths): + # Encode the video to get the embeddings + model_device = next(self.model.parameters()).device + # Preprocess the video for the model + clip_images = self.load_video_for_vclip(vid_path, num_frm=self.model.num_frm, + max_img_size=224, + start_time=kwargs.get("start_time", None), + clip_duration=kwargs.get("clip_duration", None) + ) + embeddings_tensor = self.model.get_video_embeddings([clip_images]) + + # Convert tensor to list and add to the video_features list + embeddings_list = embeddings_tensor.tolist() + + video_features.append(embeddings_list) + + return video_features + + + def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): + # Load video with VideoReader + vr = VideoReader(vid_path, ctx=cpu(0)) + fps = vr.get_avg_fps() + num_frames = len(vr) + start_idx = int(fps*kwargs.get("start_time", [0])[0]) + end_idx = start_idx+int(fps*kwargs.get("clip_duration", [num_frames])[0]) + + frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling + clip_images = [] + + # read images + temp_frms = vr.get_batch(frame_idx.astype(int).tolist()) + for idx in range(temp_frms.shape[0]): + im = temp_frms[idx] # H W C + clip_images.append(toPIL(im.permute(2,0,1))) + + return clip_images + + +class VideoVS: + def __init__(self, host, port, selected_db, video_retriever_model, chosen_video_search_type="similarity"): + self.host = host + self.port = port + self.selected_db = selected_db + self.chosen_video_search_type = chosen_video_search_type + self.constraints = None 
+ self.video_collection = 'video-test' + self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) + self.chosen_video_search_type = chosen_video_search_type + + # initialize_db + self.get_db_client() + self.init_db() + + + def get_db_client(self): + + if self.selected_db == 'vdms': + print ('Connecting to VDMS db server . . .') + self.client = VDMS_Client(host=self.host, port=self.port) + + def init_db(self): + print ('Loading db instances') + if self.selected_db == 'vdms': + self.video_db = VDMS( + client=self.client, + embedding=self.video_embedder, + collection_name=self.video_collection, + engine="FaissFlat", + distance_strategy="IP" + ) \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py new file mode 100644 index 000000000..faf5527f7 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -0,0 +1,119 @@ +import os +import time as t +from tqdm import tqdm +import cv2 +import json +import datetime +import random +from tzlocal import get_localzone +import yaml + + +def read_config(path): + with open(path, 'r') as f: + config = yaml.safe_load(f) + return config + +def calculate_intervals(video_path, chunk_duration, clip_duration): + cap = cv2.VideoCapture(video_path) + + if not cap.isOpened(): + print("Error: Could not open video.") + return + + fps = cap.get(cv2.CAP_PROP_FPS) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + total_seconds = total_frames / fps + + intervals = [] + + chunk_frames = int(chunk_duration * fps) + clip_frames = int(clip_duration * fps) + + for start_frame in range(0, total_frames, chunk_frames): + end_frame = min(start_frame + clip_frames, total_frames) + start_time = start_frame / fps + end_time = end_frame / fps + intervals.append((start_frame, end_frame, start_time, end_time)) + + cap.release() + return intervals + +def process_all_videos(config): + path = config['videos'] + meta_output_dir = config['meta_output_dir'] + selected_db = config['vector_db']['choice_of_db'] + emb_path = config['embeddings']['path'] + emb_type = config['embeddings']['type'] + chunk_duration = config['chunk_duration'] + clip_duration = config['clip_duration'] + + + videos = [file for file in os.listdir(path) if file.endswith('.mp4')] # TODO: increase supported video formats + + # print (f'Total {len(videos)} videos will be processed') + metadata = {} + + for i, each_video in enumerate(tqdm(videos)): + metadata[each_video] = {} + keyname = each_video + video_path = os.path.join(path, each_video) + date_time = datetime.datetime.now() # FIXME CHECK: is this correct? 
+ #date_time = t.ctime(os.stat(video_path).st_ctime) + # Get the local timezone of the machine + local_timezone = get_localzone() + if emb_type == 'video': + time_format = "%a %b %d %H:%M:%S %Y" + if not isinstance(date_time, datetime.datetime): + date_time = datetime.datetime.strptime(date_time, time_format) + time = date_time.strftime("%H:%M:%S") + hours, minutes, seconds = map(float, time.split(":")) + date = date_time.strftime("%Y-%m-%d") + year, month, day = map(int, date.split("-")) + + if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: + interval_count = 0 + metadata.pop(each_video) + for start_frame, end_frame, start_time, end_time in calculate_intervals(video_path, chunk_duration, clip_duration): + keyname = os.path.splitext(os.path.basename(video_path))[0]+f"_interval_{interval_count}" + metadata[keyname] = {"timestamp":start_time} + metadata[keyname].update({"date": date, "year": year, "month": month, "day": day, + "time": time, "hours": hours, "minutes": minutes, "seconds": seconds}) + if selected_db == 'vdms': + # Localize the current time to the local timezone of the machine + #Tahani might not need this + current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) + + # Convert the localized time to ISO 8601 format with timezone offset + iso_date_time = current_time_local.isoformat() + metadata[keyname]['date_time'] = {"_date": str(iso_date_time)} + + # Open the video file + cap = cv2.VideoCapture(video_path) + + if int(cv2.__version__.split('.')[0]) < 3: + fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = cap.get(cv2.CAP_PROP_FPS) + + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) + # get the duration + metadata[keyname].update({ + "clip_duration":(min(total_frames,end_frame)-start_frame)/fps, + 'fps': fps, + 'total_frames': total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + 'video_path': f'{os.path.join(path,each_video)}', + }) + cap.release() + interval_count+=1 + metadata[keyname].update({ + 'fps': fps, + 'total_frames': total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + 'video_path': f'{os.path.join(path,each_video)}', + }) + os.makedirs(meta_output_dir, exist_ok=True) + metadata_file = os.path.join(meta_output_dir, f"metadata.json") + with open(metadata_file, "w") as f: + json.dump(metadata, f, indent=4) \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py new file mode 100644 index 000000000..44d290397 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py @@ -0,0 +1,58 @@ +import yaml +import json +import os, sys +import argparse +import torch +import numpy as np +from decord import VideoReader, cpu +from transformers import AutoTokenizer, AutoProcessor, CLIPModel +import torchvision.transforms as T +toPIL = T.ToPILImage() +import torch.nn as nn +from einops import rearrange + +class vCLIP(nn.Module): + def __init__(self, cfg): + super().__init__() + + self.num_frm = cfg["num_frm"] + self.model_name = cfg["model_name"] + + self.clip = CLIPModel.from_pretrained(self.model_name) + self.processor = AutoProcessor.from_pretrained(self.model_name) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + + def get_text_embeddings(self, texts): + """ + input is list of texts + """ + text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt") + text_features = 
self.clip.get_text_features(**text_inputs) + return text_features + + + def get_image_embeddings(self, images): + """ + input is list of images + """ + image_inputs = self.processor(images=images, return_tensors="pt") + image_features = self.clip.get_image_features(**image_inputs) + return image_features + + + def get_video_embeddings(self, frames_batch): + """ + input is list of list of frames in video + """ + self.batch_size = len(frames_batch) + vid_embs = [] + for frames in frames_batch: + frame_embeddings = self.get_image_embeddings(frames) + frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=len(frames_batch)) + # Normalize, mean aggregate and return normalized video_embeddings + frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) + video_embeddings = frame_embeddings.mean(dim=1) + video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True) + vid_embs.append(video_embeddings) + return torch.cat(vid_embs, dim=0) \ No newline at end of file From e87b159c402461fbd37e1524922574da88b780a6 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 14:26:37 -0700 Subject: [PATCH 02/29] dataprep updates Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index b164d263a..ba7b33fb6 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,5 +1,5 @@ # Path to all videos -videos: video_ingest/videos/ +videos: uploaded_files/videos/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? @@ -7,13 +7,13 @@ embeddings: type: 'video' vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: 'video_ingest/embeddings' + path: 'uploaded_files/embeddings' # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml model_type: 'llama_v2' # Path to store metadata files -meta_output_dir: video_ingest/video_metadata/ +meta_output_dir: uploaded_files/video_metadata/ # Chunk duration defines the interval of time that each embedding will occur chunk_duration: 30 # Clip duration defines the length of the interval in which the embeding will occur From dc3b5b77424cdd1ffa4a3dbc56e149c3a5c7c607 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:26:10 -0700 Subject: [PATCH 03/29] rearranged dirs Signed-off-by: srinarayan-srikanthan --- .../vdms/multimodal_langchain/config.yaml | 2 +- .../multimodal_langchain/docker/Dockerfile | 2 +- .../multimodal_langchain/ingest_videos.py | 49 +++++++++++++------ .../utils/store_embeddings.py | 2 + 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index ba7b33fb6..34f7cffeb 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,5 +1,5 @@ # Path to all videos -videos: uploaded_files/videos/ +videos: uploaded_files/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? 
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile index 505448a77..a0de62cba 100644 --- a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile +++ b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile @@ -31,7 +31,7 @@ ENV PYTHONPATH=/home/user USER root -RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files +RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain USER user diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 72532387f..0d3018537 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -5,13 +5,16 @@ import os import json from tqdm import tqdm -from comps import DocPath, opea_microservices, opea_telemetry, register_microservice +from comps import opea_microservices, register_microservice from utils.utils import read_config, process_all_videos from utils import store_embeddings from utils.vclip import vCLIP +from fastapi import File, HTTPException, UploadFile +import uuid +from typing import Any, Dict, Iterable, List, Optional, Type, Union +import shutil - -VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0") +VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0") def setup_vclip_model(config, device="cpu"): model = vCLIP(config) @@ -55,6 +58,10 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): os.system(f"rm -r tmp_*") print("done.") break + +def generate_video_id(): + """Generates a unique identifier for a video file.""" + return str(uuid.uuid4()) def generate_embeddings(config, embedding_model, vs): print('inside generate') @@ -67,19 +74,11 @@ def generate_embeddings(config, embedding_model, vs): name="opea_service@prepare_doc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", - port=6007, - input_datatype=DocPath, - output_datatype=None, + port=6007 ) -@opea_telemetry -def process_videos(doc_path: DocPath): - """Ingest videos to VDMS.""" - path = doc_path.path - print(f"Parsing videos {path}.") - ################# - #set config_file - ################# +def process_videos(files: List[UploadFile] = File(None)): + """Ingest videos to VDMS.""" config= config = read_config('./config.yaml') meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} @@ -90,6 +89,28 @@ def process_videos(doc_path: DocPath): host = VECTORDB_SERVICE_HOST_IP port = int(config['vector_db']['port']) selected_db = config['vector_db']['choice_of_db'] + print(f"Parsing videos {path}.") + + #Saving videos + if files: + video_files = [] + for file in files: + if os.path.splitext(file.filename)[1] == ".mp4": + video_files.append(file) + else: + raise HTTPException( + status_code=400, detail=f"File {file.filename} is not an mp4 file. Please upload mp4 files only." 
+ ) + + for video_file in video_files: + video_id = generate_video_id() + video_name = os.path.splitext(video_file.filename)[0] + video_file_name = f"{video_name}_{video_id}.mp4" + video_dir_name = os.path.splitext(video_file_name)[0] + # Save video file in upload_directory + with open(os.path.join(path, video_file_name), "wb") as f: + shutil.copyfileobj(video_file.file, f) + # Creating DB print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 8c77c9714..6e5d849d7 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -69,6 +69,8 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): # Load video with VideoReader + import decord + decord.bridge.set_bridge('torch') vr = VideoReader(vid_path, ctx=cpu(0)) fps = vr.get_avg_fps() num_frames = len(vr) From 4045cb8549d9fe362a166c8439e1564c55c6f54d Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:38:51 -0700 Subject: [PATCH 04/29] added readme Signed-off-by: srinarayan-srikanthan --- .../vdms/multimodal_langchain/README.md | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 comps/dataprep/vdms/multimodal_langchain/README.md diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md new file mode 100644 index 000000000..c3579ceb5 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -0,0 +1,114 @@ +# Multimodal Dataprep Microservice with VDMS + +For dataprep microservice, we currently provide one framework: `Langchain`. + +# 🚀1. Start Microservice with Python (Option 1) + +## 1.1 Install Requirements + +- option 1: Install Single-process version (for 1-10 files processing) + +```bash +apt-get update +apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils +pip install -r requirements.txt +``` + +## 1.2 Start VDMS Server + +```bash +docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest +``` + +## 1.3 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export host_ip=$(hostname -I | awk '{print $1}') +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export your_hf_api_token="{your_hf_token}" +export PYTHONPATH=${path_to_comps} +``` + +## 1.4 Start Data Preparation Microservice for VDMS with Python Script + +Start document preparation microservice for VDMS with below command. + + +```bash +python ingest_videos.py +``` + +# 🚀2. 
Start Microservice with Docker (Option 2) + +## 2.1 Start VDMS Server + + +```bash +docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest +``` + + +## 2.1 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export host_ip=$(hostname -I | awk '{print $1}') +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export your_hf_api_token="{your_hf_token}" +``` + +## 2.3 Build Docker Image + +- Build docker image +```bash +cd ../../../ + docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + +``` + + +## 2.4 Run Docker Compose + + +```bash +docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d +``` + + +# 🚀3. Status Microservice + +```bash +docker container logs -f dataprep-vdms-server +``` + +# 🚀4. Consume Microservice + +Once data preparation microservice for VDMS is started, user can use below command to invoke the microservice to convert the videos to embedding and save to the database. + +Make sure the file path after `files=@` is correct. + + +- Single file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.mp4" \ + http://localhost:6007/v1/dataprep +``` +- Multiple file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.mp4" \ + -F "files=@./file2.mp4" \ + -F "files=@./file3.mp4" \ + http://localhost:6007/v1/dataprep +``` + From d4c9441a54300b8acb7e2599ed9bfd80e383dc5f Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:44:32 -0700 Subject: [PATCH 05/29] removed checks Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 0d3018537..8a5ff982d 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -56,7 +56,6 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): if i.startswith("tmp_"): print("removing tmp_*") os.system(f"rm -r tmp_*") - print("done.") break def generate_video_id(): @@ -64,7 +63,6 @@ def generate_video_id(): return str(uuid.uuid4()) def generate_embeddings(config, embedding_model, vs): - print('inside generate') process_all_videos(config) global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') print(f'global metadata file available at {global_metadata_file_path}') @@ -119,14 +117,13 @@ def process_videos(files: List[UploadFile] = File(None)): if 'video' == 'video': # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - print('init model') vs = store_embeddings.VideoVS(host, port, selected_db, model) - print('init vector store') + else: print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") return generate_embeddings(config, model, vs) - print('done............success..............') + if __name__ == "__main__": From 40117cb208834fbfefb01fb4ede2c2eb049428f2 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Wed, 4 Sep 2024 14:19:33 -0700 Subject: [PATCH 06/29] added features Signed-off-by: srinarayan-srikanthan --- 
.../vdms/multimodal_langchain/README.md | 2 + .../vdms/multimodal_langchain/config.yaml | 3 -- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 43 ++++++++++++++++--- .../utils/store_embeddings.py | 4 +- 5 files changed, 42 insertions(+), 12 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index c3579ceb5..6cd5828e5 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -28,6 +28,7 @@ export https_proxy=${your_http_proxy} export host_ip=$(hostname -I | awk '{print $1}') export VDMS_HOST=${host_ip} export VDMS_PORT=55555 +export INDEX_NAME="rag-vdms" export your_hf_api_token="{your_hf_token}" export PYTHONPATH=${path_to_comps} ``` @@ -59,6 +60,7 @@ export https_proxy=${your_http_proxy} export host_ip=$(hostname -I | awk '{print $1}') export VDMS_HOST=${host_ip} export VDMS_PORT=55555 +export INDEX_NAME="rag-vdms" export your_hf_api_token="{your_hf_token}" ``` diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 34f7cffeb..209110920 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -22,9 +22,6 @@ clip_duration: 10 vector_db: choice_of_db: 'vdms' # #Supported databases [vdms] - host: 0.0.0.0 - port: 55555 - # LLM path model_path: meta-llama/Llama-2-7b-chat-hf \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml index edb733c7d..a08aa1877 100644 --- a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml @@ -20,7 +20,7 @@ services: https_proxy: ${https_proxy} VDMS_HOST: ${VDMS_HOST} VDMS_PORT: ${VDMS_PORT} - COLLECTION_NAME: ${COLLECTION_NAME} + INDEX_NAME: ${INDEX_NAME} restart: unless-stopped networks: diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 8a5ff982d..d5bb2c321 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -10,11 +10,15 @@ from utils import store_embeddings from utils.vclip import vCLIP from fastapi import File, HTTPException, UploadFile +from fastapi.responses import FileResponse import uuid from typing import Any, Dict, Iterable, List, Optional, Type, Union import shutil +from pathlib import Path VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0") +VECTORDB_SERVICE_PORT = os.getenv("VDMS_PORT", 55555) +collection_name = os.getenv("INDEX_NAME", "rag-vdms") def setup_vclip_model(config, device="cpu"): model = vCLIP(config) @@ -69,23 +73,23 @@ def generate_embeddings(config, embedding_model, vs): store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) @register_microservice( - name="opea_service@prepare_doc_vdms", + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007 ) -def process_videos(files: List[UploadFile] = File(None)): +async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" - config= config = read_config('./config.yaml') + config = read_config('./config.yaml') meanclip_cfg = 
{"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} generate_frames = config['generate_frames'] path = config['videos'] meta_output_dir = config['meta_output_dir'] emb_path = config['embeddings']['path'] host = VECTORDB_SERVICE_HOST_IP - port = int(config['vector_db']['port']) + port = int(VECTORDB_SERVICE_PORT) selected_db = config['vector_db']['choice_of_db'] print(f"Parsing videos {path}.") @@ -117,14 +121,41 @@ def process_videos(files: List[UploadFile] = File(None)): if 'video' == 'video': # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model) + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name) else: print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") return generate_embeddings(config, model, vs) +@register_microservice( + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007 +) +async def rag_get_file_structure(): + """Returns list of names of uploaded videos saved on the server.""" + config = read_config('./config.yaml') + if not Path( config['videos']).exists(): + print("No file uploaded, return empty list.") + return [] + + uploaded_videos = os.listdir(config['videos']) + mp4_files = [file for file in uploaded_videos if file.endswith(".mp4")] + return mp4_files + +@register_microservice( + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007 +) +async def rag_get_file(filename: str): + """Download the file from remote.""" + + config = read_config('./config.yaml') + UPLOAD_DIR=config['videos'] + file_path = os.path.join(UPLOAD_DIR, filename) + if os.path.exists(file_path): + return FileResponse(path=file_path, filename=filename) + else: + return {"error": "File not found"} if __name__ == "__main__": - opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_videodoc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 6e5d849d7..3487c675b 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -90,13 +90,13 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, chosen_video_search_type="similarity"): + def __init__(self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity"): self.host = host self.port = port self.selected_db = selected_db self.chosen_video_search_type = chosen_video_search_type self.constraints = None - self.video_collection = 'video-test' + self.video_collection = collection_name self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) self.chosen_video_search_type = chosen_video_search_type From f9d1e2b3979532e339985b88e7e52baae98f856e Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Wed, 4 Sep 2024 21:49:25 -0700 Subject: [PATCH 07/29] added get method Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/README.md | 4 ++++ comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 4 ++-- 2 files 
changed, 6 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 6cd5828e5..44c54df97 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -113,4 +113,8 @@ curl -X POST \ -F "files=@./file3.mp4" \ http://localhost:6007/v1/dataprep ``` +- List of uploaded files +```bash +curl -X POST http://localhost:6007/v1/dataprep/get_videos +``` diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index d5bb2c321..fa1467eab 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -129,7 +129,7 @@ async def process_videos(files: List[UploadFile] = File(None)): generate_embeddings(config, model, vs) @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007,methods=["GET"] ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -143,7 +143,7 @@ async def rag_get_file_structure(): return mp4_files @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007,methods=["GET"] ) async def rag_get_file(filename: str): """Download the file from remote.""" From ea8e83eac4a0d41026229d63ffbcb461db3b1ecf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Sep 2024 04:56:10 +0000 Subject: [PATCH 08/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/vdms/langchain/__init__.py | 2 +- comps/dataprep/vdms/langchain/config.py | 2 +- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../vdms/langchain/prepare_doc_vdms.py | 3 +- .../dataprep/vdms/langchain/requirements.txt | 16 +-- .../vdms/multimodal_langchain/README.md | 12 +- .../vdms/multimodal_langchain/__init__.py | 2 +- .../vdms/multimodal_langchain/config.yaml | 15 +- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 129 ++++++++++-------- .../multimodal_langchain/requirements.txt | 16 +-- .../utils/store_embeddings.py | 71 +++++----- .../vdms/multimodal_langchain/utils/utils.py | 109 +++++++++------ .../vdms/multimodal_langchain/utils/vclip.py | 40 +++--- 14 files changed, 228 insertions(+), 193 deletions(-) diff --git a/comps/dataprep/vdms/langchain/__init__.py b/comps/dataprep/vdms/langchain/__init__.py index 4582b4f9a..916f3a44b 100644 --- a/comps/dataprep/vdms/langchain/__init__.py +++ b/comps/dataprep/vdms/langchain/__init__.py @@ -1,2 +1,2 @@ # Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/vdms/langchain/config.py b/comps/dataprep/vdms/langchain/config.py index 3e3e06a16..e12ba1502 100644 --- a/comps/dataprep/vdms/langchain/config.py +++ b/comps/dataprep/vdms/langchain/config.py @@ -30,4 +30,4 @@ def getEnv(key, default_value=None): CHUNK_OVERLAP = getEnv("CHUNK_OVERLAP", 100) current_file_path = 
os.path.abspath(__file__) -parent_dir = os.path.dirname(current_file_path) \ No newline at end of file +parent_dir = os.path.dirname(current_file_path) diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml index edb733c7d..46880119e 100644 --- a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml +++ b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml @@ -25,4 +25,4 @@ services: networks: default: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index 13591dd26..e6f7d0072 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -78,4 +77,4 @@ def ingest_documents(doc_path: DocPath): if __name__ == "__main__": - opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_doc_vdms"].start() diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt index 859dec9f9..f6044266c 100644 --- a/comps/dataprep/vdms/langchain/requirements.txt +++ b/comps/dataprep/vdms/langchain/requirements.txt @@ -1,8 +1,10 @@ beautifulsoup4 cairosvg +decord docarray[full] docx2txt easyocr +einops fastapi huggingface_hub langchain @@ -12,6 +14,7 @@ langchain-text-splitters langsmith markdown numpy +opencv-python opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk @@ -23,17 +26,12 @@ pyspark python-bidi==0.4.2 python-docx python-pptx +PyYAML sentence_transformers shortuuid +tqdm +typing +tzlocal unstructured[all-docs]==0.11.5 uvicorn vdms -tqdm -tzlocal -opencv-python -tqdm -tzlocal -PyYAML -typing -decord -einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 44c54df97..54e878b65 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -37,7 +37,6 @@ export PYTHONPATH=${path_to_comps} Start document preparation microservice for VDMS with below command. - ```bash python ingest_videos.py ``` @@ -46,12 +45,10 @@ python ingest_videos.py ## 2.1 Start VDMS Server - ```bash docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest ``` - ## 2.1 Setup Environment Variables ```bash @@ -66,22 +63,20 @@ export your_hf_api_token="{your_hf_token}" ## 2.3 Build Docker Image -- Build docker image +- Build docker image + ```bash cd ../../../ docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . ``` - ## 2.4 Run Docker Compose - ```bash docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d ``` - # 🚀3. Status Microservice ```bash @@ -94,7 +89,6 @@ Once data preparation microservice for VDMS is started, user can use below comma Make sure the file path after `files=@` is correct. 
- - Single file upload ```bash @@ -103,6 +97,7 @@ curl -X POST \ -F "files=@./file1.mp4" \ http://localhost:6007/v1/dataprep ``` + - Multiple file upload ```bash @@ -113,6 +108,7 @@ curl -X POST \ -F "files=@./file3.mp4" \ http://localhost:6007/v1/dataprep ``` + - List of uploaded files ```bash diff --git a/comps/dataprep/vdms/multimodal_langchain/__init__.py b/comps/dataprep/vdms/multimodal_langchain/__init__.py index 4582b4f9a..916f3a44b 100644 --- a/comps/dataprep/vdms/multimodal_langchain/__init__.py +++ b/comps/dataprep/vdms/multimodal_langchain/__init__.py @@ -1,2 +1,2 @@ # Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 209110920..40c327615 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,27 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + # Path to all videos videos: uploaded_files/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? embeddings: - type: 'video' + type: "video" vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: 'uploaded_files/embeddings' + path: "uploaded_files/embeddings" # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml - model_type: 'llama_v2' + model_type: "llama_v2" # Path to store metadata files meta_output_dir: uploaded_files/video_metadata/ # Chunk duration defines the interval of time that each embedding will occur chunk_duration: 30 -# Clip duration defines the length of the interval in which the embeding will occur +# Clip duration defines the length of the interval in which the embedding will occur clip_duration: 10 # e.g. 
For every <chunk_duration>, you embed the first <clip_duration>'s frames of that interval
 vector_db:
-  choice_of_db: 'vdms' # #Supported databases [vdms]
+  choice_of_db: "vdms" # #Supported databases [vdms]

 # LLM path
-model_path: meta-llama/Llama-2-7b-chat-hf
\ No newline at end of file
+model_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
index a08aa1877..785dc6408 100644
--- a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
@@ -25,4 +25,4 @@ services:

 networks:
   default:
-    driver: bridge
\ No newline at end of file
+    driver: bridge
diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
index fa1467eab..eeadcb1e1 100644
--- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
+++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
@@ -1,56 +1,60 @@
-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-import os
 import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+from fastapi import File, HTTPException, UploadFile
+from fastapi.responses import FileResponse
 from tqdm import tqdm
-from comps import opea_microservices, register_microservice
-from utils.utils import read_config, process_all_videos
 from utils import store_embeddings
+from utils.utils import process_all_videos, read_config
 from utils.vclip import vCLIP
-from fastapi import File, HTTPException, UploadFile
-from fastapi.responses import FileResponse
-import uuid
-from typing import Any, Dict, Iterable, List, Optional, Type, Union
-import shutil
-from pathlib import Path
+
+from comps import opea_microservices, register_microservice

 VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0")
 VECTORDB_SERVICE_PORT = os.getenv("VDMS_PORT", 55555)
 collection_name = os.getenv("INDEX_NAME", "rag-vdms")

+
 def setup_vclip_model(config, device="cpu"):
     model = vCLIP(config)
     return model

+
 def read_json(path):
     with open(path) as f:
         x = json.load(f)
     return x

+
 def store_into_vectordb(vs, metadata_file_path, embedding_model, config):
     GMetadata = read_json(metadata_file_path)
     global_counter = 0
     total_videos = len(GMetadata.keys())
-
+
     for idx, (video, data) in enumerate(tqdm(GMetadata.items())):
         image_name_list = []
         embedding_list = []
         metadata_list = []
         ids = []
-
-        if config['embeddings']['type'] == 'video':
-            data['video'] = video
+
+        if config["embeddings"]["type"] == "video":
+            data["video"] = video
             video_name_list = [data["video_path"]]
             metadata_list = [data]
-            if vs.selected_db == 'vdms':
+            if vs.selected_db == "vdms":
                 vs.video_db.add_videos(
                     paths=video_name_list,
                     metadatas=metadata_list,
-                    start_time=[data['timestamp']],
-                    clip_duration=[data['clip_duration']]
+                    start_time=[data["timestamp"]],
+                    clip_duration=[data["clip_duration"]],
                 )
             else:
                 print(f"ERROR: selected_db {vs.selected_db} not supported.
Supported:[vdms]") @@ -59,41 +63,41 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): for i in os.listdir(): if i.startswith("tmp_"): print("removing tmp_*") - os.system(f"rm -r tmp_*") + os.system("rm -r tmp_*") break - + + def generate_video_id(): """Generates a unique identifier for a video file.""" - return str(uuid.uuid4()) + return str(uuid.uuid4()) + def generate_embeddings(config, embedding_model, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') - print(f'global metadata file available at {global_metadata_file_path}') + global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") + print(f"global metadata file available at {global_metadata_file_path}") store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) - -@register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007 -) + +@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" - - config = read_config('./config.yaml') - meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} - generate_frames = config['generate_frames'] - path = config['videos'] - meta_output_dir = config['meta_output_dir'] - emb_path = config['embeddings']['path'] + + config = read_config("./config.yaml") + meanclip_cfg = { + "model_name": config["embeddings"]["vclip_model_name"], + "num_frm": config["embeddings"]["vclip_num_frm"], + } + generate_frames = config["generate_frames"] + path = config["videos"] + meta_output_dir = config["meta_output_dir"] + emb_path = config["embeddings"]["path"] host = VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config['vector_db']['choice_of_db'] + selected_db = config["vector_db"]["choice_of_db"] print(f"Parsing videos {path}.") - - #Saving videos + + # Saving videos if files: video_files = [] for file in files: @@ -113,43 +117,56 @@ async def process_videos(files: List[UploadFile] = File(None)): with open(os.path.join(path, video_file_name), "wb") as f: shutil.copyfileobj(video_file.file, f) - # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') - print('Connecting to {} at {}:{}'.format(selected_db, host, port)) - #check embedding type - if 'video' == 'video': + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." 
+ ) + print("Connecting to {} at {}:{}".format(selected_db, host, port)) + # check embedding type + if "video" == "video": # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name) + vs = store_embeddings.VideoVS(host, port, selected_db, model, collection_name) else: - print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") + print( + f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in ['video', 'frame']" + ) return generate_embeddings(config, model, vs) - + + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007,methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_videos", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" - config = read_config('./config.yaml') - if not Path( config['videos']).exists(): + config = read_config("./config.yaml") + if not Path(config["videos"]).exists(): print("No file uploaded, return empty list.") return [] - uploaded_videos = os.listdir(config['videos']) + uploaded_videos = os.listdir(config["videos"]) mp4_files = [file for file in uploaded_videos if file.endswith(".mp4")] return mp4_files + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007,methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_file/{filename}", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file(filename: str): """Download the file from remote.""" - - config = read_config('./config.yaml') - UPLOAD_DIR=config['videos'] + + config = read_config("./config.yaml") + UPLOAD_DIR = config["videos"] file_path = os.path.join(UPLOAD_DIR, filename) if os.path.exists(file_path): return FileResponse(path=file_path, filename=filename) @@ -158,4 +175,4 @@ async def rag_get_file(filename: str): if __name__ == "__main__": - opea_microservices["opea_service@prepare_videodoc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_videodoc_vdms"].start() diff --git a/comps/dataprep/vdms/multimodal_langchain/requirements.txt b/comps/dataprep/vdms/multimodal_langchain/requirements.txt index 859dec9f9..f6044266c 100644 --- a/comps/dataprep/vdms/multimodal_langchain/requirements.txt +++ b/comps/dataprep/vdms/multimodal_langchain/requirements.txt @@ -1,8 +1,10 @@ beautifulsoup4 cairosvg +decord docarray[full] docx2txt easyocr +einops fastapi huggingface_hub langchain @@ -12,6 +14,7 @@ langchain-text-splitters langsmith markdown numpy +opencv-python opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk @@ -23,17 +26,12 @@ pyspark python-bidi==0.4.2 python-docx python-pptx +PyYAML sentence_transformers shortuuid +tqdm +typing +tzlocal unstructured[all-docs]==0.11.5 uvicorn vdms -tqdm -tzlocal -opencv-python -tqdm -tzlocal -PyYAML -typing -decord -einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 3487c675b..6468e5195 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -1,19 +1,25 @@ 
+# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Any, Dict, Iterable, List, Optional + +import numpy as np +import torch +import torchvision.transforms as T +from decord import VideoReader, cpu +from langchain.pydantic_v1 import BaseModel, root_validator from langchain_community.vectorstores import VDMS from langchain_community.vectorstores.vdms import VDMS_Client -from langchain.pydantic_v1 import BaseModel, root_validator from langchain_core.embeddings import Embeddings -from decord import VideoReader, cpu -import numpy as np -from typing import List, Optional, Iterable, Dict, Any from PIL import Image -import torch -import os -import time -import torchvision.transforms as T + toPIL = T.ToPILImage() # 'similarity', 'similarity_score_threshold' (needs threshold), 'mmr' + class vCLIPEmbeddings(BaseModel, Embeddings): """MeanCLIP Embeddings model.""" @@ -28,9 +34,7 @@ def validate_environment(cls, values: Dict) -> Dict: raise ValueError("Model must be provided during initialization.") except ImportError: - raise ImportError( - "Please ensure CLIP model is loaded" - ) + raise ImportError("Please ensure CLIP model is loaded") return values def embed_documents(self, texts: List[str]) -> List[List[float]]: @@ -39,11 +43,9 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: return text_features.detach().numpy() - def embed_query(self, text: str) -> List[float]: return self.embed_documents([text])[0] - def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: # Open images directly as PIL images @@ -52,11 +54,13 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: # Encode the video to get the embeddings model_device = next(self.model.parameters()).device # Preprocess the video for the model - clip_images = self.load_video_for_vclip(vid_path, num_frm=self.model.num_frm, - max_img_size=224, - start_time=kwargs.get("start_time", None), - clip_duration=kwargs.get("clip_duration", None) - ) + clip_images = self.load_video_for_vclip( + vid_path, + num_frm=self.model.num_frm, + max_img_size=224, + start_time=kwargs.get("start_time", None), + clip_duration=kwargs.get("clip_duration", None), + ) embeddings_tensor = self.model.get_video_embeddings([clip_images]) # Convert tensor to list and add to the video_features list @@ -66,31 +70,33 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: return video_features - def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): # Load video with VideoReader import decord - decord.bridge.set_bridge('torch') + + decord.bridge.set_bridge("torch") vr = VideoReader(vid_path, ctx=cpu(0)) fps = vr.get_avg_fps() num_frames = len(vr) - start_idx = int(fps*kwargs.get("start_time", [0])[0]) - end_idx = start_idx+int(fps*kwargs.get("clip_duration", [num_frames])[0]) + start_idx = int(fps * kwargs.get("start_time", [0])[0]) + end_idx = start_idx + int(fps * kwargs.get("clip_duration", [num_frames])[0]) - frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling + frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling clip_images = [] # read images temp_frms = vr.get_batch(frame_idx.astype(int).tolist()) for idx in range(temp_frms.shape[0]): - im = temp_frms[idx] # H W C - clip_images.append(toPIL(im.permute(2,0,1))) + im = temp_frms[idx] # H W C + clip_images.append(toPIL(im.permute(2, 0, 1))) 
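# decord returns frames in HWC layout; ToPILImage expects CHW, hence the permute above.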
return clip_images class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity"): + def __init__( + self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity" + ): self.host = host self.port = port self.selected_db = selected_db @@ -104,20 +110,19 @@ def __init__(self, host, port, selected_db, video_retriever_model, collection_na self.get_db_client() self.init_db() - def get_db_client(self): - if self.selected_db == 'vdms': - print ('Connecting to VDMS db server . . .') + if self.selected_db == "vdms": + print("Connecting to VDMS db server . . .") self.client = VDMS_Client(host=self.host, port=self.port) def init_db(self): - print ('Loading db instances') - if self.selected_db == 'vdms': + print("Loading db instances") + if self.selected_db == "vdms": self.video_db = VDMS( client=self.client, embedding=self.video_embedder, collection_name=self.video_collection, engine="FaissFlat", - distance_strategy="IP" - ) \ No newline at end of file + distance_strategy="IP", + ) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index faf5527f7..d83c0319f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -1,19 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import datetime +import json import os +import random import time as t -from tqdm import tqdm + import cv2 -import json -import datetime -import random -from tzlocal import get_localzone import yaml +from tqdm import tqdm +from tzlocal import get_localzone def read_config(path): - with open(path, 'r') as f: + with open(path, "r") as f: config = yaml.safe_load(f) return config - + + def calculate_intervals(video_path, chunk_duration, clip_duration): cap = cv2.VideoCapture(video_path) @@ -39,30 +44,30 @@ def calculate_intervals(video_path, chunk_duration, clip_duration): cap.release() return intervals -def process_all_videos(config): - path = config['videos'] - meta_output_dir = config['meta_output_dir'] - selected_db = config['vector_db']['choice_of_db'] - emb_path = config['embeddings']['path'] - emb_type = config['embeddings']['type'] - chunk_duration = config['chunk_duration'] - clip_duration = config['clip_duration'] +def process_all_videos(config): + path = config["videos"] + meta_output_dir = config["meta_output_dir"] + selected_db = config["vector_db"]["choice_of_db"] + emb_path = config["embeddings"]["path"] + emb_type = config["embeddings"]["type"] + chunk_duration = config["chunk_duration"] + clip_duration = config["clip_duration"] - videos = [file for file in os.listdir(path) if file.endswith('.mp4')] # TODO: increase supported video formats + videos = [file for file in os.listdir(path) if file.endswith(".mp4")] # TODO: increase supported video formats # print (f'Total {len(videos)} videos will be processed') metadata = {} - + for i, each_video in enumerate(tqdm(videos)): metadata[each_video] = {} keyname = each_video video_path = os.path.join(path, each_video) - date_time = datetime.datetime.now() # FIXME CHECK: is this correct? - #date_time = t.ctime(os.stat(video_path).st_ctime) + date_time = datetime.datetime.now() # FIXME CHECK: is this correct? 
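# datetime.datetime.now() stamps each clip with the ingestion time; the
# commented-out alternative below would instead use the file's st_ctime
# (filesystem creation/metadata-change time).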
+ # date_time = t.ctime(os.stat(video_path).st_ctime) # Get the local timezone of the machine local_timezone = get_localzone() - if emb_type == 'video': + if emb_type == "video": time_format = "%a %b %d %H:%M:%S %Y" if not isinstance(date_time, datetime.datetime): date_time = datetime.datetime.strptime(date_time, time_format) @@ -74,46 +79,62 @@ def process_all_videos(config): if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: interval_count = 0 metadata.pop(each_video) - for start_frame, end_frame, start_time, end_time in calculate_intervals(video_path, chunk_duration, clip_duration): - keyname = os.path.splitext(os.path.basename(video_path))[0]+f"_interval_{interval_count}" - metadata[keyname] = {"timestamp":start_time} - metadata[keyname].update({"date": date, "year": year, "month": month, "day": day, - "time": time, "hours": hours, "minutes": minutes, "seconds": seconds}) - if selected_db == 'vdms': + for start_frame, end_frame, start_time, end_time in calculate_intervals( + video_path, chunk_duration, clip_duration + ): + keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" + metadata[keyname] = {"timestamp": start_time} + metadata[keyname].update( + { + "date": date, + "year": year, + "month": month, + "day": day, + "time": time, + "hours": hours, + "minutes": minutes, + "seconds": seconds, + } + ) + if selected_db == "vdms": # Localize the current time to the local timezone of the machine - #Tahani might not need this + # Tahani might not need this current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) # Convert the localized time to ISO 8601 format with timezone offset iso_date_time = current_time_local.isoformat() - metadata[keyname]['date_time'] = {"_date": str(iso_date_time)} + metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} # Open the video file cap = cv2.VideoCapture(video_path) - if int(cv2.__version__.split('.')[0]) < 3: + if int(cv2.__version__.split(".")[0]) < 3: fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) else: fps = cap.get(cv2.CAP_PROP_FPS) - + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) # get the duration - metadata[keyname].update({ - "clip_duration":(min(total_frames,end_frame)-start_frame)/fps, - 'fps': fps, - 'total_frames': total_frames, + metadata[keyname].update( + { + "clip_duration": (min(total_frames, end_frame) - start_frame) / fps, + "fps": fps, + "total_frames": total_frames, #'embedding_path': os.path.join(emb_path, each_video+".pt"), - 'video_path': f'{os.path.join(path,each_video)}', - }) + "video_path": f"{os.path.join(path,each_video)}", + } + ) cap.release() - interval_count+=1 - metadata[keyname].update({ - 'fps': fps, - 'total_frames': total_frames, + interval_count += 1 + metadata[keyname].update( + { + "fps": fps, + "total_frames": total_frames, #'embedding_path': os.path.join(emb_path, each_video+".pt"), - 'video_path': f'{os.path.join(path,each_video)}', - }) + "video_path": f"{os.path.join(path,each_video)}", + } + ) os.makedirs(meta_output_dir, exist_ok=True) - metadata_file = os.path.join(meta_output_dir, f"metadata.json") + metadata_file = os.path.join(meta_output_dir, "metadata.json") with open(metadata_file, "w") as f: - json.dump(metadata, f, indent=4) \ No newline at end of file + json.dump(metadata, f, indent=4) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py index 44d290397..89e5830d6 100644 --- 
a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py @@ -1,16 +1,23 @@ -import yaml -import json -import os, sys +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse -import torch +import json +import os +import sys + import numpy as np -from decord import VideoReader, cpu -from transformers import AutoTokenizer, AutoProcessor, CLIPModel +import torch import torchvision.transforms as T +import yaml +from decord import VideoReader, cpu +from transformers import AutoProcessor, AutoTokenizer, CLIPModel + toPIL = T.ToPILImage() import torch.nn as nn from einops import rearrange + class vCLIP(nn.Module): def __init__(self, cfg): super().__init__() @@ -22,37 +29,28 @@ def __init__(self, cfg): self.processor = AutoProcessor.from_pretrained(self.model_name) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - def get_text_embeddings(self, texts): - """ - input is list of texts - """ + """Input is list of texts.""" text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt") text_features = self.clip.get_text_features(**text_inputs) return text_features - def get_image_embeddings(self, images): - """ - input is list of images - """ + """Input is list of images.""" image_inputs = self.processor(images=images, return_tensors="pt") image_features = self.clip.get_image_features(**image_inputs) - return image_features - + return image_features def get_video_embeddings(self, frames_batch): - """ - input is list of list of frames in video - """ + """Input is list of list of frames in video.""" self.batch_size = len(frames_batch) vid_embs = [] for frames in frames_batch: frame_embeddings = self.get_image_embeddings(frames) frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=len(frames_batch)) # Normalize, mean aggregate and return normalized video_embeddings - frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) + frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) video_embeddings = frame_embeddings.mean(dim=1) video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True) vid_embs.append(video_embeddings) - return torch.cat(vid_embs, dim=0) \ No newline at end of file + return torch.cat(vid_embs, dim=0) From 200e3187bd3beb5ad0f1a459aef421fa58e5b2c0 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Thu, 5 Sep 2024 17:54:01 +0800 Subject: [PATCH 09/29] add dim at init, rm unused Signed-off-by: BaoHuiling --- .../vdms/multimodal_langchain/config.yaml | 4 +- .../multimodal_langchain/ingest_videos.py | 90 ++++++-------- .../utils/store_embeddings.py | 7 +- .../vdms/multimodal_langchain/utils/utils.py | 117 +++++++++--------- 4 files changed, 101 insertions(+), 117 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 40c327615..b2f420180 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -7,10 +7,10 @@ videos: uploaded_files/ generate_frames: True # How do you want to generate feature embeddings? 
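# Note: vector_dimensions below must match the embedding width of vclip_model_name;
# openai/clip-vit-base-patch32 outputs 512-dimensional vectors.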
embeddings: - type: "video" vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: "uploaded_files/embeddings" + vector_dimensions: 512 + path: 'uploaded_files/embeddings' # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index eeadcb1e1..ecfda8a50 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -32,32 +32,28 @@ def read_json(path): x = json.load(f) return x - -def store_into_vectordb(vs, metadata_file_path, embedding_model, config): +def store_into_vectordb(vs, metadata_file_path, dimensions): GMetadata = read_json(metadata_file_path) - global_counter = 0 total_videos = len(GMetadata.keys()) for idx, (video, data) in enumerate(tqdm(GMetadata.items())): - image_name_list = [] - embedding_list = [] metadata_list = [] ids = [] - - if config["embeddings"]["type"] == "video": - data["video"] = video - video_name_list = [data["video_path"]] - metadata_list = [data] - if vs.selected_db == "vdms": - vs.video_db.add_videos( - paths=video_name_list, - metadatas=metadata_list, - start_time=[data["timestamp"]], - clip_duration=[data["clip_duration"]], - ) - else: - print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") + + data['video'] = video + video_name_list = [data["video_path"]] + metadata_list = [data] + if vs.selected_db == 'vdms': + vs.video_db.add_videos( + paths=video_name_list, + metadatas=metadata_list, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']], + embedding_dimensions=dimensions, + ) + else: + print(f"ERROR: selected_db {vs.selected_db} not supported. 
Supported:[vdms]") # clean up tmp_ folders containing frames (jpeg) for i in os.listdir(): @@ -71,15 +67,19 @@ def generate_video_id(): """Generates a unique identifier for a video file.""" return str(uuid.uuid4()) - -def generate_embeddings(config, embedding_model, vs): +def generate_embeddings(config, dimensions, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") - print(f"global metadata file available at {global_metadata_file_path}") - store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) - + global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') + print(f'global metadata file available at {global_metadata_file_path}') + store_into_vectordb(vs, global_metadata_file_path, dimensions) + +@register_microservice( + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007 +) -@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" @@ -94,7 +94,8 @@ async def process_videos(files: List[UploadFile] = File(None)): emb_path = config["embeddings"]["path"] host = VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config["vector_db"]["choice_of_db"] + selected_db = config['vector_db']['choice_of_db'] + vector_dimensions = config["embeddings"]["vector_dimensions"] print(f"Parsing videos {path}.") # Saving videos @@ -118,30 +119,17 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print( - "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." 
- ) - print("Connecting to {} at {}:{}".format(selected_db, host, port)) - # check embedding type - if "video" == "video": - # init meanclip model - model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model, collection_name) - - else: - print( - f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in ['video', 'frame']" - ) - return - generate_embeddings(config, model, vs) + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') + print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + # init meanclip model + model = setup_vclip_model(meanclip_cfg, device="cpu") + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + generate_embeddings(config, vector_dimensions, vs) + @register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep/get_videos", - host="0.0.0.0", - port=6007, - methods=["GET"], + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007, methods=["GET"] ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -156,11 +144,7 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep/get_file/{filename}", - host="0.0.0.0", - port=6007, - methods=["GET"], + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007, methods=["GET"] ) async def rag_get_file(filename: str): """Download the file from remote.""" diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 6468e5195..3ff5e21ee 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -94,9 +94,8 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__( - self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity" - ): + def __init__(self, host, port, selected_db, video_retriever_model, collection_name, embedding_dimensions:int = 512, chosen_video_search_type="similarity"): + self.host = host self.port = port self.selected_db = selected_db @@ -105,6 +104,7 @@ def __init__( self.video_collection = collection_name self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) self.chosen_video_search_type = chosen_video_search_type + self.embedding_dimensions = embedding_dimensions # initialize_db self.get_db_client() @@ -125,4 +125,5 @@ def init_db(self): collection_name=self.video_collection, engine="FaissFlat", distance_strategy="IP", + embedding_dimensions=self.embedding_dimensions ) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index d83c0319f..ba661cffd 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -67,65 +67,64 @@ def process_all_videos(config): # date_time = t.ctime(os.stat(video_path).st_ctime) # Get the local timezone of the machine local_timezone = get_localzone() - if emb_type == "video": - time_format = "%a %b %d 
%H:%M:%S %Y" - if not isinstance(date_time, datetime.datetime): - date_time = datetime.datetime.strptime(date_time, time_format) - time = date_time.strftime("%H:%M:%S") - hours, minutes, seconds = map(float, time.split(":")) - date = date_time.strftime("%Y-%m-%d") - year, month, day = map(int, date.split("-")) - - if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: - interval_count = 0 - metadata.pop(each_video) - for start_frame, end_frame, start_time, end_time in calculate_intervals( - video_path, chunk_duration, clip_duration - ): - keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" - metadata[keyname] = {"timestamp": start_time} - metadata[keyname].update( - { - "date": date, - "year": year, - "month": month, - "day": day, - "time": time, - "hours": hours, - "minutes": minutes, - "seconds": seconds, - } - ) - if selected_db == "vdms": - # Localize the current time to the local timezone of the machine - # Tahani might not need this - current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) - - # Convert the localized time to ISO 8601 format with timezone offset - iso_date_time = current_time_local.isoformat() - metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} - - # Open the video file - cap = cv2.VideoCapture(video_path) - - if int(cv2.__version__.split(".")[0]) < 3: - fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) - else: - fps = cap.get(cv2.CAP_PROP_FPS) - - total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) - # get the duration - metadata[keyname].update( - { - "clip_duration": (min(total_frames, end_frame) - start_frame) / fps, - "fps": fps, - "total_frames": total_frames, - #'embedding_path': os.path.join(emb_path, each_video+".pt"), - "video_path": f"{os.path.join(path,each_video)}", - } - ) - cap.release() - interval_count += 1 + time_format = "%a %b %d %H:%M:%S %Y" + if not isinstance(date_time, datetime.datetime): + date_time = datetime.datetime.strptime(date_time, time_format) + time = date_time.strftime("%H:%M:%S") + hours, minutes, seconds = map(float, time.split(":")) + date = date_time.strftime("%Y-%m-%d") + year, month, day = map(int, date.split("-")) + + if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: + interval_count = 0 + metadata.pop(each_video) + for start_frame, end_frame, start_time, end_time in calculate_intervals( + video_path, chunk_duration, clip_duration + ): + keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" + metadata[keyname] = {"timestamp": start_time} + metadata[keyname].update( + { + "date": date, + "year": year, + "month": month, + "day": day, + "time": time, + "hours": hours, + "minutes": minutes, + "seconds": seconds, + } + ) + if selected_db == "vdms": + # Localize the current time to the local timezone of the machine + # Tahani might not need this + current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) + + # Convert the localized time to ISO 8601 format with timezone offset + iso_date_time = current_time_local.isoformat() + metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} + + # Open the video file + cap = cv2.VideoCapture(video_path) + + if int(cv2.__version__.split(".")[0]) < 3: + fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = cap.get(cv2.CAP_PROP_FPS) + + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) + # get the duration + metadata[keyname].update( + { + 
"clip_duration": (min(total_frames, end_frame) - start_frame) / fps, + "fps": fps, + "total_frames": total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + "video_path": f"{os.path.join(path,each_video)}", + } + ) + cap.release() + interval_count += 1 metadata[keyname].update( { "fps": fps, From c6e12f1c4f56e85cbaf130b0ae2490fc5662d74c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:55:00 +0000 Subject: [PATCH 10/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdms/multimodal_langchain/config.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 52 +++++++++++-------- .../utils/store_embeddings.py | 15 ++++-- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index b2f420180..43ce11f4f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -10,7 +10,7 @@ embeddings: vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 vector_dimensions: 512 - path: 'uploaded_files/embeddings' + path: "uploaded_files/embeddings" # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index ecfda8a50..6092ca53f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -32,6 +32,7 @@ def read_json(path): x = json.load(f) return x + def store_into_vectordb(vs, metadata_file_path, dimensions): GMetadata = read_json(metadata_file_path) @@ -40,16 +41,16 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): for idx, (video, data) in enumerate(tqdm(GMetadata.items())): metadata_list = [] ids = [] - - data['video'] = video + + data["video"] = video video_name_list = [data["video_path"]] metadata_list = [data] - if vs.selected_db == 'vdms': + if vs.selected_db == "vdms": vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data['timestamp']], - clip_duration=[data['clip_duration']], + start_time=[data["timestamp"]], + clip_duration=[data["clip_duration"]], embedding_dimensions=dimensions, ) else: @@ -67,19 +68,15 @@ def generate_video_id(): """Generates a unique identifier for a video file.""" return str(uuid.uuid4()) + def generate_embeddings(config, dimensions, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') - print(f'global metadata file available at {global_metadata_file_path}') + global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") + print(f"global metadata file available at {global_metadata_file_path}") store_into_vectordb(vs, global_metadata_file_path, dimensions) - -@register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007 -) + +@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" @@ -94,7 +91,7 @@ async def process_videos(files: List[UploadFile] = File(None)): emb_path = config["embeddings"]["path"] host = 
VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config['vector_db']['choice_of_db'] + selected_db = config["vector_db"]["choice_of_db"] vector_dimensions = config["embeddings"]["vector_dimensions"] print(f"Parsing videos {path}.") @@ -119,17 +116,26 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') - print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." + ) + print("Connecting to {} at {}:{}".format(selected_db, host, port)) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + vs = store_embeddings.VideoVS( + host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions + ) generate_embeddings(config, vector_dimensions, vs) - + + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007, methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_videos", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -144,7 +150,11 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007, methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_file/{filename}", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file(filename: str): """Download the file from remote.""" diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 3ff5e21ee..ce364e669 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -94,8 +94,17 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, collection_name, embedding_dimensions:int = 512, chosen_video_search_type="similarity"): - + def __init__( + self, + host, + port, + selected_db, + video_retriever_model, + collection_name, + embedding_dimensions: int = 512, + chosen_video_search_type="similarity", + ): + self.host = host self.port = port self.selected_db = selected_db @@ -125,5 +134,5 @@ def init_db(self): collection_name=self.video_collection, engine="FaissFlat", distance_strategy="IP", - embedding_dimensions=self.embedding_dimensions + embedding_dimensions=self.embedding_dimensions, ) From b07036e32f5017b7954b9dbeabc822c48b3f80f9 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Fri, 6 Sep 2024 17:32:16 +0800 Subject: [PATCH 11/29] add wait after connect DB Signed-off-by: BaoHuiling --- .../multimodal_langchain/ingest_videos.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py 
b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 6092ca53f..ab7e45d0c 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -7,6 +7,7 @@ import uuid from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Type, Union +import time from fastapi import File, HTTPException, UploadFile from fastapi.responses import FileResponse @@ -49,9 +50,8 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data["timestamp"]], - clip_duration=[data["clip_duration"]], - embedding_dimensions=dimensions, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']] ) else: print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") @@ -116,17 +116,15 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print( - "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." - ) - print("Connecting to {} at {}:{}".format(selected_db, host, port)) + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.', flush=True) + print('Connecting to {} at {}:{}'.format(selected_db, host, port), flush=True) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS( - host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions - ) - + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + print("done creating DB, sleep 5s", flush=True) + time.sleep(5) + generate_embeddings(config, vector_dimensions, vs) From 0afc7b56b4c2b76c31f852f5cffb045f14ecffb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:34:38 +0000 Subject: [PATCH 12/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multimodal_langchain/ingest_videos.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index ab7e45d0c..bb8612c41 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -4,10 +4,10 @@ import json import os import shutil +import time import uuid from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Type, Union -import time from fastapi import File, HTTPException, UploadFile from fastapi.responses import FileResponse @@ -50,8 +50,8 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data['timestamp']], - clip_duration=[data['clip_duration']] + start_time=[data["timestamp"]], + clip_duration=[data["clip_duration"]], ) else: print(f"ERROR: selected_db {vs.selected_db} not supported. 
Supported:[vdms]") @@ -116,15 +116,20 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.', flush=True) - print('Connecting to {} at {}:{}'.format(selected_db, host, port), flush=True) + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.", + flush=True, + ) + print("Connecting to {} at {}:{}".format(selected_db, host, port), flush=True) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + vs = store_embeddings.VideoVS( + host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions + ) print("done creating DB, sleep 5s", flush=True) time.sleep(5) - + generate_embeddings(config, vector_dimensions, vs) From 9261a4ad00df571cc96b7b8e6500ba3058138985 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Fri, 6 Sep 2024 20:56:22 +0800 Subject: [PATCH 13/29] remove unused Signed-off-by: BaoHuiling --- comps/dataprep/vdms/multimodal_langchain/utils/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index ba661cffd..3bb991395 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -50,7 +50,6 @@ def process_all_videos(config): meta_output_dir = config["meta_output_dir"] selected_db = config["vector_db"]["choice_of_db"] emb_path = config["embeddings"]["path"] - emb_type = config["embeddings"]["type"] chunk_duration = config["chunk_duration"] clip_duration = config["clip_duration"] From b06006a0e08d6e2c4f5f293020931d685be4f2bd Mon Sep 17 00:00:00 2001 From: Huiling Bao Date: Tue, 10 Sep 2024 13:35:20 +0800 Subject: [PATCH 14/29] Update comps/dataprep/vdms/README.md Co-authored-by: XinyuYe-Intel Signed-off-by: BaoHuiling --- comps/dataprep/vdms/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md index 617761f02..71c35df8a 100644 --- a/comps/dataprep/vdms/README.md +++ b/comps/dataprep/vdms/README.md @@ -141,7 +141,7 @@ You can specify chunk_size and chunk_size by the following commands. 
```bash curl -X POST \ -H "Content-Type: multipart/form-data" \ - -F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \ + -F "files=@./LLAMA2_page6.pdf" \ -F "chunk_size=1500" \ -F "chunk_overlap=100" \ http://localhost:6007/v1/dataprep From 56c578f9c2f254e7d1deb6444f994321c36508db Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 15:45:06 +0800 Subject: [PATCH 15/29] add test script for mm case Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100755 tests/test_dataprep_vdms_multimodal_langchain.sh diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh new file mode 100755 index 000000000..91dfb5f68 --- /dev/null +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/dataprep-vdms built fail" + exit 1 + else + echo "opea/dataprep-vdms built successful" + fi + docker pull intellabs/vdms:latest +} + +function start_service() { + VDMS_PORT=5043 + docker run -d --name="test-comps-dataprep-vdms" -p $VDMS_PORT:55555 intellabs/vdms:latest + dataprep_service_port=5013 + COLLECTION_NAME="test-comps" + docker run -d --name="test-comps-dataprep-vdms-server" -e COLLECTION_NAME=$COLLECTION_NAME -e no_proxy=$no_proxy -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VDMS_HOST=$ip_address -e VDMS_PORT=$VDMS_PORT -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-vdms:comps + sleep 30s +} + +function validate_microservice() { + cd $LOG_PATH + wget -q https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 + dataprep_service_port=5013 + + # test /v1/dataprep upload file + URL="http://$ip_address:$dataprep_service_port/v1/dataprep" + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} ) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-videos.log) + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." + else + echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi + else + echo "[ dataprep-upload-videos ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi + rm ./silence_girl.mp4 + + # test /v1/dataprep/get_videos + URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" + + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X GET ${URL} | tee ${LOG_PATH}/dataprep-get-videos.log) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ dataprep-get-videos ] Content is correct." + else + echo "[ dataprep-get-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log + exit 1 + fi + else + echo "[ dataprep-get-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log + exit 1 + fi + + # test /v1/dataprep/get_file/{filename} + file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) + echo $file_list + filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') + URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" + + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ download_file ]HTTP status is 200. Checking content..." + CONTENT=$(ls -l) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ download_file ] Content is correct." + else + echo "[ download_file ] Content is not correct. $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log + exit 1 + fi + else + echo "[ download_file ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From dc11dc2588e2a12782dfbc69241f3c02ac6b1ee2 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 15:45:31 +0800 Subject: [PATCH 16/29] add return value and update readme Signed-off-by: BaoHuiling --- comps/dataprep/vdms/multimodal_langchain/README.md | 10 +++++++++- .../vdms/multimodal_langchain/ingest_videos.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 54e878b65..96d719c9f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -112,5 +112,13 @@ curl -X POST \ - List of uploaded files ```bash -curl -X POST http://localhost:6007/v1/dataprep/get_videos +curl -X GET http://localhost:6007/v1/dataprep/get_videos +``` + +- Download uploaded files + +Please use the file name from the list + +```bash +curl -X GET http://localhost:6007/v1/dataprep/get_file/${filename} ``` diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index bb8612c41..91bd5d9be 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -131,6 +131,8 @@ async def process_videos(files: List[UploadFile] = File(None)): time.sleep(5) generate_embeddings(config, vector_dimensions, vs) + + return {"message": "Videos ingested successfully"} @register_microservice( From 04e12249734922a180f2d9c77f2516ab90b39b90 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:46:13 +0000 Subject: [PATCH 17/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 2 +- tests/test_dataprep_vdms_multimodal_langchain.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 91bd5d9be..132913e3f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -131,7 +131,7 @@ async def process_videos(files: List[UploadFile] = File(None)): time.sleep(5) generate_embeddings(config, vector_dimensions, vs) - + return {"message": "Videos ingested successfully"} diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index 91dfb5f68..686269da2 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -60,7 +60,7 @@ function validate_microservice() { # test /v1/dataprep/get_videos URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" - + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." 
@@ -77,13 +77,13 @@ function validate_microservice() { docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log exit 1 fi - + # test /v1/dataprep/get_file/{filename} file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) echo $file_list filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" - + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ download_file ]HTTP status is 200. Checking content..." From ea465e4d933daa10ccc30abf1351e05c8e726ba0 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 17:05:14 +0800 Subject: [PATCH 18/29] check bug Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 42 +++++++------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index 686269da2..a079f8391 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -33,29 +33,23 @@ function start_service() { function validate_microservice() { cd $LOG_PATH - wget -q https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 - dataprep_service_port=5013 + wget https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 -O silence_girl.mp4 + ls && sleep 5 # test /v1/dataprep upload file URL="http://$ip_address:$dataprep_service_port/v1/dataprep" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} ) - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." - local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-videos.log) - if echo "$CONTENT" | grep "Videos ingested successfully"; then - echo "[ dataprep-upload-videos ] Content is correct." - else - echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log - docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log - exit 1 - fi + CONTENT=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) + echo "[ dataprep-upload-videos ] Checking content..." + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." else - echo "[ dataprep-upload-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + echo "[ dataprep-upload-videos ] Content is not correct. 
Received content was $CONTENT" docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log exit 1 fi + + sleep 1s rm ./silence_girl.mp4 # test /v1/dataprep/get_videos @@ -84,22 +78,16 @@ function validate_microservice() { filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ download_file ]HTTP status is 200. Checking content..." - CONTENT=$(ls -l) - if echo "$CONTENT" | grep "silence_girl"; then - echo "[ download_file ] Content is correct." - else - echo "[ download_file ] Content is not correct. $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log - exit 1 - fi + http_proxy="" wget ${URL} + CONTENT=$(ls) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ download_file ] Content is correct." else - echo "[ download_file ] HTTP status is not 200. Received status was $HTTP_STATUS" + echo "[ download_file ] Content is not correct. $CONTENT" docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log exit 1 fi + } function stop_docker() { From acc7a05d1655fff4ec68c53c1943c273cf7f8ea6 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:01:34 +0800 Subject: [PATCH 19/29] fix mm-script Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index a079f8391..e53e528b5 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -34,17 +34,28 @@ function start_service() { function validate_microservice() { cd $LOG_PATH wget https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 -O silence_girl.mp4 - ls && sleep 5 + sleep 5 # test /v1/dataprep upload file URL="http://$ip_address:$dataprep_service_port/v1/dataprep" - CONTENT=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) - echo "[ dataprep-upload-videos ] Checking content..." - if echo "$CONTENT" | grep "Videos ingested successfully"; then - echo "[ dataprep-upload-videos ] Content is correct." + + response=$(http_proxy="" curl -s -w "\n%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) + CONTENT=$(echo "$response" | sed -e '$ d') + HTTP_STATUS=$(echo "$response" | tail -n 1) + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." + else + echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi else - echo "[ dataprep-upload-videos ] Content is not correct. 
Received content was $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + echo "[ dataprep-upload-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log exit 1 fi @@ -55,10 +66,12 @@ function validate_microservice() { # test /v1/dataprep/get_videos URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + response=$(http_proxy="" curl -s -w "\n%{http_code}" -X GET ${URL}) + CONTENT=$(echo "$response" | sed -e '$ d') + HTTP_STATUS=$(echo "$response" | tail -n 1) + if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." - local CONTENT=$(http_proxy="" curl -s -X GET ${URL} | tee ${LOG_PATH}/dataprep-get-videos.log) + echo "[ dataprep-get-videos ] HTTP status is 200. Checking content..." if echo "$CONTENT" | grep "silence_girl"; then echo "[ dataprep-get-videos ] Content is correct." else @@ -73,8 +86,7 @@ function validate_microservice() { fi # test /v1/dataprep/get_file/{filename} - file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) - echo $file_list + file_list=$CONTENT filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" From a66da363a4ad869bf9f5fafc405c7cb3554bb091 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:44:37 +0800 Subject: [PATCH 20/29] add into dataprep workflow Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index e17783051..83c053dd6 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -23,3 +23,6 @@ services: build: dockerfile: comps/dataprep/pinecone/docker/Dockerfile image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} + dataprep-vdms: + build: + dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile From 2699710e3d61a033fb5af54d111c1eb13eac3291 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:48:08 +0800 Subject: [PATCH 21/29] rm whitespace Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index 83c053dd6..384f0ceca 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -25,4 +25,4 @@ services: image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} dataprep-vdms: build: - dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile + dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile From ebe5a91927f3ace896b2740d3aeceab78f0a2bf8 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 16:16:29 -0700 Subject: [PATCH 22/29] updated readme and added test script Signed-off-by: srinarayan-srikanthan --- tests/dataprep-upload-file.log | 1 + 
tests/test_dataprep_vdms_langchain.sh | 83 +++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/dataprep-upload-file.log create mode 100644 tests/test_dataprep_vdms_langchain.sh diff --git a/tests/dataprep-upload-file.log b/tests/dataprep-upload-file.log new file mode 100644 index 000000000..d0c4bf41f --- /dev/null +++ b/tests/dataprep-upload-file.log @@ -0,0 +1 @@ +{"status":200,"message":"Data preparation succeeded"} \ No newline at end of file diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/test_dataprep_vdms_langchain.sh new file mode 100644 index 000000000..817d0a89d --- /dev/null +++ b/tests/test_dataprep_vdms_langchain.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/dataprep-vdms built fail" + exit 1 + else + echo "opea/dataprep-vdms built successful" + fi + docker pull intellabs/vdms:latest +} + +function start_service() { + VDMS_PORT=5043 + docker run -d --name="test-comps-dataprep-vdms" -p $VDMS_PORT:55555 intellabs/vdms:latest + dataprep_service_port=5013 + COLLECTION_NAME="test-comps" + docker run -d --name="test-comps-dataprep-vdms-server" -e COLLECTION_NAME=$COLLECTION_NAME -e no_proxy=$no_proxy -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VDMS_HOST=$ip_address -e VDMS_PORT=$VDMS_PORT -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-vdms:comps + sleep 30s +} + +function validate_microservice() { + cd $LOG_PATH + + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + + dataprep_service_port=5013 + + URL="http://$ip_address:$dataprep_service_port/v1/dataprep" + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' ${URL} ) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-file ] HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-file.log) + if echo "$CONTENT" | grep "Data preparation succeeded"; then + echo "[ dataprep-upload-file ] Content is correct." + else + echo "[ dataprep-upload-file ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-file.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-file_vdms.log + exit 1 + fi + else + echo "[ dataprep-upload-file ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-file.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-file_vdms.log + exit 1 + fi + rm ./dataprep_file.txt + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 2b6f6d5557085efa4a38b8fef45ca169befe4781 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 16:17:11 -0700 Subject: [PATCH 23/29] removed unused file Signed-off-by: srinarayan-srikanthan --- tests/dataprep-upload-file.log | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tests/dataprep-upload-file.log diff --git a/tests/dataprep-upload-file.log b/tests/dataprep-upload-file.log deleted file mode 100644 index d0c4bf41f..000000000 --- a/tests/dataprep-upload-file.log +++ /dev/null @@ -1 +0,0 @@ -{"status":200,"message":"Data preparation succeeded"} \ No newline at end of file From 808f1f7bba270c9fd545b85805643cbf3e02d920 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:20:54 +0000 Subject: [PATCH 24/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_dataprep_vdms_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/test_dataprep_vdms_langchain.sh index 817d0a89d..7a7d14fe4 100644 --- a/tests/test_dataprep_vdms_langchain.sh +++ b/tests/test_dataprep_vdms_langchain.sh @@ -33,7 +33,7 @@ function start_service() { function validate_microservice() { cd $LOG_PATH - + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." 
> $LOG_PATH/dataprep_file.txt dataprep_service_port=5013 From 9fe2571b96fbc40f73af855a72d313481455ca04 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Wed, 11 Sep 2024 07:39:04 +0800 Subject: [PATCH 25/29] move test script Signed-off-by: BaoHuiling --- tests/{ => dataprep}/test_dataprep_vdms_langchain.sh | 0 tests/{ => dataprep}/test_dataprep_vdms_multimodal_langchain.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{ => dataprep}/test_dataprep_vdms_langchain.sh (100%) rename tests/{ => dataprep}/test_dataprep_vdms_multimodal_langchain.sh (100%) diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/dataprep/test_dataprep_vdms_langchain.sh similarity index 100% rename from tests/test_dataprep_vdms_langchain.sh rename to tests/dataprep/test_dataprep_vdms_langchain.sh diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh similarity index 100% rename from tests/test_dataprep_vdms_multimodal_langchain.sh rename to tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh From ebe7c7d8c2592443d12e31955015e25624aeb51b Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 18:37:36 -0700 Subject: [PATCH 26/29] restructured repo Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/README.md | 16 +-- .../vdms/langchain/{docker => }/Dockerfile | 4 + .../docker-compose-dataprep-vdms.yaml | 0 .../vdms/langchain/prepare_doc_vdms.py | 108 +++++++++++++++--- .../{docker => }/Dockerfile | 0 .../vdms/multimodal_langchain/README.md | 4 +- .../docker-compose-dataprep-vdms.yaml | 0 7 files changed, 106 insertions(+), 26 deletions(-) rename comps/dataprep/vdms/langchain/{docker => }/Dockerfile (86%) rename comps/dataprep/vdms/langchain/{docker => }/docker-compose-dataprep-vdms.yaml (100%) rename comps/dataprep/vdms/multimodal_langchain/{docker => }/Dockerfile (100%) rename comps/dataprep/vdms/multimodal_langchain/{docker => }/docker-compose-dataprep-vdms.yaml (100%) diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md index 71c35df8a..2a0d2ca45 100644 --- a/comps/dataprep/vdms/README.md +++ b/comps/dataprep/vdms/README.md @@ -10,7 +10,7 @@ We organized the folders in the same way, so you can use either framework for da ## 1.1 Install Requirements -- option 1: Install Single-process version (for 1-10 files processing) +Install Single-process version (for 1-10 files processing) ```bash apt-get update @@ -46,7 +46,7 @@ export PYTHONPATH=${path_to_comps} Start document preparation microservice for VDMS with below command. -- option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash python prepare_doc_vdms.py @@ -82,22 +82,22 @@ export PYTHONPATH=${path_to_comps} - Build docker image with langchain -* option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash cd ../../../ -docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . +docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/Dockerfile . ``` +docker build -t opea/dataprep-on-ray-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain_ray/Dockerfile . 
--> ## 2.4 Run Docker with CLI -- option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \ @@ -127,7 +127,7 @@ Once document preparation microservice for VDMS is started, user can use below c Make sure the file path after `files=@` is correct. - +``` diff --git a/comps/dataprep/vdms/langchain/docker/Dockerfile b/comps/dataprep/vdms/langchain/Dockerfile similarity index 86% rename from comps/dataprep/vdms/langchain/docker/Dockerfile rename to comps/dataprep/vdms/langchain/Dockerfile index 606b0a4e1..df5b75544 100644 --- a/comps/dataprep/vdms/langchain/docker/Dockerfile +++ b/comps/dataprep/vdms/langchain/Dockerfile @@ -28,6 +28,10 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ ENV PYTHONPATH=/home/user +USER root + +RUN mkdir -p /home/user/comps/dataprep/vdms/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/langchain + USER user WORKDIR /home/user/comps/dataprep/vdms/langchain diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker-compose-dataprep-vdms.yaml similarity index 100% rename from comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml rename to comps/dataprep/vdms/langchain/docker-compose-dataprep-vdms.yaml diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index e6f7d0072..8bfc309bb 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -2,30 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 import os - +import json from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client from langchain_text_splitters import HTMLHeaderTextSplitter - -from comps import DocPath, opea_microservices, opea_telemetry, register_microservice -from comps.dataprep.utils import document_loader, get_separators, get_tables_result +from fastapi import Body, File, Form, HTTPException, UploadFile +from typing import List, Optional, Union +from comps import CustomLogger,DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import document_loader, get_separators, get_tables_result, encode_filename, save_content_to_local_disk, parse_html, create_upload_folder tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) +logger = CustomLogger("prepare_doc_redis") +logflag = os.getenv("LOGFLAG", False) +upload_folder = "./uploaded_files/" - -@register_microservice( - name="opea_service@prepare_doc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007, - input_datatype=DocPath, - output_datatype=None, -) -@opea_telemetry -def ingest_documents(doc_path: DocPath): +def ingest_data_to_vdms(doc_path: DocPath): """Ingest document to VDMS.""" path = doc_path.path print(f"Parsing document {doc_path}.") @@ -42,7 +36,7 @@ def ingest_documents(doc_path: DocPath): chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() ) - content = document_loader(doc_path) + content = document_loader(path) chunks = 
text_splitter.split_text(content) if doc_path.process_table and path.endswith(".pdf"): table_chunks = get_tables_result(path, doc_path.table_strategy) @@ -76,5 +70,87 @@ def ingest_documents(doc_path: DocPath): print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + logger.info(f"[ upload ] link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + + for file in files: + encode_file = encode_filename(file.filename) + doc_id = "file:" + encode_file + if logflag: + logger.info(f"[ upload ] processing file {doc_id}") + + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_vdms( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"[ upload ] Successfully saved file {save_path}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail=f"Link_list {link_list} should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + doc_id = "file:" + encoded_link + ".txt" + if logflag: + logger.info(f"[ upload ] processing link {doc_id}") + + # check whether the link file already exists + + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) + ingest_data_to_vdms( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + + if __name__ == "__main__": + create_upload_folder(upload_folder) opea_microservices["opea_service@prepare_doc_vdms"].start() diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/Dockerfile similarity index 100% rename from comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile rename to comps/dataprep/vdms/multimodal_langchain/Dockerfile diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 96d719c9f..0b5b721fa 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -67,14 +67,14 @@ export your_hf_api_token="{your_hf_token}" ```bash cd ../../../ - docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . 
+ docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/Dockerfile . ``` ## 2.4 Run Docker Compose ```bash -docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d +docker compose -f comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml up -d ``` # 🚀3. Status Microservice diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml similarity index 100% rename from comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml rename to comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml From cb2c033c5a4379347d8900e49df2552b717e9ea3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:41:27 +0000 Subject: [PATCH 27/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdms/langchain/prepare_doc_vdms.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index 8bfc309bb..c89c7517f 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -1,17 +1,27 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os import json +import os +from typing import List, Optional, Union + from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT +from fastapi import Body, File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client from langchain_text_splitters import HTMLHeaderTextSplitter -from fastapi import Body, File, Form, HTTPException, UploadFile -from typing import List, Optional, Union -from comps import CustomLogger,DocPath, opea_microservices, register_microservice -from comps.dataprep.utils import document_loader, get_separators, get_tables_result, encode_filename, save_content_to_local_disk, parse_html, create_upload_folder + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + create_upload_folder, + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) @@ -19,6 +29,7 @@ logflag = os.getenv("LOGFLAG", False) upload_folder = "./uploaded_files/" + def ingest_data_to_vdms(doc_path: DocPath): """Ingest document to VDMS.""" path = doc_path.path @@ -130,7 +141,7 @@ async def ingest_documents( logger.info(f"[ upload ] processing link {doc_id}") # check whether the link file already exists - + save_path = upload_folder + encoded_link + ".txt" content = parse_html([link])[0][0] await save_content_to_local_disk(save_path, content) @@ -150,7 +161,6 @@ async def ingest_documents( raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") - if __name__ == "__main__": 
create_upload_folder(upload_folder) opea_microservices["opea_service@prepare_doc_vdms"].start() From a8d2657498a0b41ec645b303eec619f743566bca Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 18:53:14 -0700 Subject: [PATCH 28/29] updates path in test script Signed-off-by: srinarayan-srikanthan --- tests/dataprep/test_dataprep_vdms_langchain.sh | 2 +- tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataprep/test_dataprep_vdms_langchain.sh b/tests/dataprep/test_dataprep_vdms_langchain.sh index 7a7d14fe4..4fe0d0f0a 100644 --- a/tests/dataprep/test_dataprep_vdms_langchain.sh +++ b/tests/dataprep/test_dataprep_vdms_langchain.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/Dockerfile . if [ $? -ne 0 ]; then echo "opea/dataprep-vdms built fail" diff --git a/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh index e53e528b5..3dc70a7a3 100755 --- a/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/Dockerfile . if [ $? -ne 0 ]; then echo "opea/dataprep-vdms built fail" From 1fbc34314f98e4646607707cf228524e35824181 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Wed, 11 Sep 2024 12:24:14 +0800 Subject: [PATCH 29/29] add name for build Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index bd292fc47..80f46ab7e 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -26,3 +26,4 @@ services: dataprep-vdms: build: dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile + image: ${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest}