From b469976c37e83027e771ed04e6a277a5c283dfd9 Mon Sep 17 00:00:00 2001
From: srinarayan-srikanthan
Date: Mon, 2 Sep 2024 20:09:27 -0700
Subject: [PATCH 01/29] dataprep service

Signed-off-by: srinarayan-srikanthan
---
 comps/dataprep/vdms/README.md | 189 ++++++++++++++++++
 comps/dataprep/vdms/langchain/__init__.py | 2 +
 comps/dataprep/vdms/langchain/config.py | 33 +++
 .../dataprep/vdms/langchain/docker/Dockerfile | 35 ++++
 .../docker/docker-compose-dataprep-vdms.yaml | 28 +++
 .../vdms/langchain/prepare_doc_vdms.py | 81 ++++++++
 .../dataprep/vdms/langchain/requirements.txt | 39 ++++
 .../vdms/multimodal_langchain/__init__.py | 2 +
 .../vdms/multimodal_langchain/config.yaml | 30 +++
 .../multimodal_langchain/docker/Dockerfile | 40 ++++
 .../docker/docker-compose-dataprep-vdms.yaml | 28 +++
 .../multimodal_langchain/ingest_videos.py | 112 +++++++++++
 .../multimodal_langchain/requirements.txt | 39 ++++
 .../utils/store_embeddings.py | 121 +++++++++++
 .../vdms/multimodal_langchain/utils/utils.py | 119 +++++++++++
 .../vdms/multimodal_langchain/utils/vclip.py | 58 ++++++
 16 files changed, 956 insertions(+)
 create mode 100644 comps/dataprep/vdms/README.md
 create mode 100644 comps/dataprep/vdms/langchain/__init__.py
 create mode 100644 comps/dataprep/vdms/langchain/config.py
 create mode 100644 comps/dataprep/vdms/langchain/docker/Dockerfile
 create mode 100644 comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml
 create mode 100644 comps/dataprep/vdms/langchain/prepare_doc_vdms.py
 create mode 100644 comps/dataprep/vdms/langchain/requirements.txt
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/__init__.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/config.yaml
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/requirements.txt
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/utils.py
 create mode 100644 comps/dataprep/vdms/multimodal_langchain/utils/vclip.py

diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md
new file mode 100644
index 000000000..617761f02
--- /dev/null
+++ b/comps/dataprep/vdms/README.md
@@ -0,0 +1,189 @@
+# Dataprep Microservice with VDMS
+
+For the dataprep microservice, we currently provide one framework: `Langchain`.
+
+The `langchain` and `multimodal_langchain` folders are organized the same way, so you can follow the same steps below for either one when building the dataprep microservice.
+
+# 🚀1. Start Microservice with Python (Option 1)
+
+## 1.1 Install Requirements
+
+- Option 1: Install the single-process version (for processing 1-10 files)
+
+```bash
+apt-get update
+apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils
+cd langchain
+pip install -r requirements.txt
+```
+
+## 1.2 Start VDMS Server
+
+Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).
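+
+If you just need a local server and don't want to follow the full readme, a minimal sketch consistent with the compose file added later in this patch (image `intellabs/vdms:latest`, default VDMS port 55555) is:
+
+```bash
+docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest
+```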
+
+## 1.3 Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export VDMS_HOST=${host_ip}
+export VDMS_PORT=55555
+export COLLECTION_NAME=${your_collection_name}
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
+export PYTHONPATH=${path_to_comps}
+```
+
+## 1.4 Start Document Preparation Microservice for VDMS with Python Script
+
+Start the document preparation microservice for VDMS with the command below.
+
+- Option 1: Start the single-process version (for processing 1-10 files)
+
+```bash
+python prepare_doc_vdms.py
+```
+
+# 🚀2. Start Microservice with Docker (Option 2)
+
+## 2.1 Start VDMS Server
+
+Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).
+
+## 2.2 Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export VDMS_HOST=${host_ip}
+export VDMS_PORT=55555
+export TEI_ENDPOINT=${your_tei_endpoint}
+export COLLECTION_NAME=${your_collection_name}
+export SEARCH_ENGINE="FaissFlat"
+export DISTANCE_STRATEGY="L2"
+export PYTHONPATH=${path_to_comps}
+```
+
+## 2.3 Build Docker Image
+
+- Build the docker image with langchain
+
+- Option 1: Build the single-process version (for processing 1-10 files)
+
+```bash
+cd ../../../
+docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile .
+```
+
+## 2.4 Run Docker with CLI
+
+- Option 1: Start the single-process version (for processing 1-10 files)
+
+```bash
+docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \
+-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \
+-e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \
+opea/dataprep-vdms:latest
+```
+
+# 🚀3. Status Microservice
+
+```bash
+docker container logs -f dataprep-vdms-server
+```
+
+# 🚀4. Consume Microservice
+
+Once the document preparation microservice for VDMS is started, you can use the command below to invoke the microservice, which converts a document into embeddings and saves them to the database.
+
+Make sure the file path after `files=@` is correct.
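+
+As a sketch, assuming the multipart `files` upload interface that the `files=@` note above implies (the file name here is a placeholder), a typical call looks like:
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./your_file.pdf" \
+  http://localhost:6007/v1/dataprep
+```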
+ + diff --git a/comps/dataprep/vdms/langchain/__init__.py b/comps/dataprep/vdms/langchain/__init__.py new file mode 100644 index 000000000..4582b4f9a --- /dev/null +++ b/comps/dataprep/vdms/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/config.py b/comps/dataprep/vdms/langchain/config.py new file mode 100644 index 000000000..3e3e06a16 --- /dev/null +++ b/comps/dataprep/vdms/langchain/config.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + + +def getEnv(key, default_value=None): + env_value = os.getenv(key, default=default_value) + print(f"{key}: {env_value}") + return env_value + + +# Embedding model +EMBED_MODEL = getEnv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# VDMS configuration +VDMS_HOST = getEnv("VDMS_HOST", "localhost") +VDMS_PORT = int(getEnv("VDMS_PORT", 55555)) +COLLECTION_NAME = getEnv("COLLECTION_NAME", "rag-vdms") +SEARCH_ENGINE = getEnv("SEARCH_ENGINE", "FaissFlat") +DISTANCE_STRATEGY = getEnv("DISTANCE_STRATEGY", "L2") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = getEnv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = getEnv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = getEnv("TEI_ENDPOINT") + +# chunk parameters +CHUNK_SIZE = getEnv("CHUNK_SIZE", 1500) +CHUNK_OVERLAP = getEnv("CHUNK_OVERLAP", 100) + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/docker/Dockerfile b/comps/dataprep/vdms/langchain/docker/Dockerfile new file mode 100644 index 000000000..606b0a4e1 --- /dev/null +++ b/comps/dataprep/vdms/langchain/docker/Dockerfile @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libcairo2-dev \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/langchain/requirements.txt + +ENV PYTHONPATH=/home/user + +USER user + +WORKDIR /home/user/comps/dataprep/vdms/langchain + +ENTRYPOINT ["python", "prepare_doc_vdms.py"] \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml new file mode 100644 index 000000000..edb733c7d --- /dev/null +++ b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + vdms-vector-db: + image: intellabs/vdms:latest + container_name: vdms-vector-db + ports: + - "55555:55555" + dataprep-vdms: + image: opea/dataprep-vdms:latest + container_name: dataprep-vdms-server + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + VDMS_HOST: ${VDMS_HOST} + 
VDMS_PORT: ${VDMS_PORT} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py new file mode 100644 index 000000000..13591dd26 --- /dev/null +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -0,0 +1,81 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.vdms import VDMS, VDMS_Client +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import DocPath, opea_microservices, opea_telemetry, register_microservice +from comps.dataprep.utils import document_loader, get_separators, get_tables_result + +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) + + +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +@opea_telemetry +def ingest_documents(doc_path: DocPath): + """Ingest document to VDMS.""" + path = doc_path.path + print(f"Parsing document {doc_path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + ) + + content = document_loader(doc_path) + chunks = text_splitter.split_text(content) + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + + print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") + + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + for i in range(0, num_chunks, batch_size): + batch_chunks = chunks[i : i + batch_size] + batch_texts = batch_chunks + + _ = VDMS.from_texts( + client=client, + embedding=embedder, + collection_name=COLLECTION_NAME, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + texts=batch_texts, + ) + print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt new file mode 100644 index 000000000..859dec9f9 --- /dev/null +++ b/comps/dataprep/vdms/langchain/requirements.txt @@ -0,0 +1,39 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-core +langchain-text-splitters +langsmith +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +python-bidi==0.4.2 +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.11.5 +uvicorn +vdms +tqdm +tzlocal +opencv-python +tqdm +tzlocal +PyYAML +typing +decord +einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/__init__.py b/comps/dataprep/vdms/multimodal_langchain/__init__.py new file mode 100644 index 000000000..4582b4f9a --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml new file mode 100644 index 000000000..b164d263a --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -0,0 +1,30 @@ +# Path to all videos +videos: video_ingest/videos/ +# Do you want to extract frames of videos (True if not done already, else False) +generate_frames: True +# How do you want to generate feature embeddings? +embeddings: + type: 'video' + vclip_model_name: "openai/clip-vit-base-patch32" + vclip_num_frm: 64 + path: 'video_ingest/embeddings' +# VL-branch config +vl_branch: + cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml + model_type: 'llama_v2' +# Path to store metadata files +meta_output_dir: video_ingest/video_metadata/ +# Chunk duration defines the interval of time that each embedding will occur +chunk_duration: 30 +# Clip duration defines the length of the interval in which the embeding will occur +clip_duration: 10 +# e.g. 
For every <chunk_duration>, you embed the first <clip_duration>'s frames of that interval
+
+vector_db:
+  choice_of_db: 'vdms' # #Supported databases [vdms]
+  host: 0.0.0.0
+  port: 55555
+
+
+# LLM path
+model_path: meta-llama/Llama-2-7b-chat-hf
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
new file mode 100644
index 000000000..505448a77
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile
@@ -0,0 +1,40 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+ARG ARCH="cpu"
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libcairo2-dev \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/multimodal_langchain/requirements.txt
+
+ENV PYTHONPATH=/home/user
+
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files
+
+USER user
+
+WORKDIR /home/user/comps/dataprep/vdms/multimodal_langchain
+
+ENTRYPOINT ["python", "ingest_videos.py"]
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
new file mode 100644
index 000000000..edb733c7d
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
@@ -0,0 +1,28 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3"
+services:
+  vdms-vector-db:
+    image: intellabs/vdms:latest
+    container_name: vdms-vector-db
+    ports:
+      - "55555:55555"
+  dataprep-vdms:
+    image: opea/dataprep-vdms:latest
+    container_name: dataprep-vdms-server
+    ports:
+      - "6007:6007"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      VDMS_HOST: ${VDMS_HOST}
+      VDMS_PORT: ${VDMS_PORT}
+      COLLECTION_NAME: ${COLLECTION_NAME}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
\ No newline at end of file
diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
new file mode 100644
index 000000000..72532387f
--- /dev/null
+++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
@@ -0,0 +1,112 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import json
+from tqdm import tqdm
+from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
+from utils.utils import read_config, process_all_videos
+from utils import store_embeddings
+from utils.vclip import vCLIP
+
+
+VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0")
+
+def setup_vclip_model(config, device="cpu"):
+    model = vCLIP(config)
+    return model
+
+def read_json(path):
+    with open(path) as f:
+        x = json.load(f)
+    return x
+
+def store_into_vectordb(vs, metadata_file_path, embedding_model, config):
+    GMetadata = 
read_json(metadata_file_path) + global_counter = 0 + + total_videos = len(GMetadata.keys()) + + for idx, (video, data) in enumerate(tqdm(GMetadata.items())): + image_name_list = [] + embedding_list = [] + metadata_list = [] + ids = [] + + if config['embeddings']['type'] == 'video': + data['video'] = video + video_name_list = [data["video_path"]] + metadata_list = [data] + if vs.selected_db == 'vdms': + vs.video_db.add_videos( + paths=video_name_list, + metadatas=metadata_list, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']] + ) + else: + print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") + + # clean up tmp_ folders containing frames (jpeg) + for i in os.listdir(): + if i.startswith("tmp_"): + print("removing tmp_*") + os.system(f"rm -r tmp_*") + print("done.") + break + +def generate_embeddings(config, embedding_model, vs): + print('inside generate') + process_all_videos(config) + global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') + print(f'global metadata file available at {global_metadata_file_path}') + store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) + +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +@opea_telemetry +def process_videos(doc_path: DocPath): + """Ingest videos to VDMS.""" + path = doc_path.path + print(f"Parsing videos {path}.") + + ################# + #set config_file + ################# + + config= config = read_config('./config.yaml') + meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} + generate_frames = config['generate_frames'] + path = config['videos'] + meta_output_dir = config['meta_output_dir'] + emb_path = config['embeddings']['path'] + host = VECTORDB_SERVICE_HOST_IP + port = int(config['vector_db']['port']) + selected_db = config['vector_db']['choice_of_db'] + + # Creating DB + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') + print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + #check embedding type + if 'video' == 'video': + # init meanclip model + model = setup_vclip_model(meanclip_cfg, device="cpu") + print('init model') + vs = store_embeddings.VideoVS(host, port, selected_db, model) + print('init vector store') + else: + print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") + return + generate_embeddings(config, model, vs) + print('done............success..............') + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/requirements.txt b/comps/dataprep/vdms/multimodal_langchain/requirements.txt new file mode 100644 index 000000000..859dec9f9 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/requirements.txt @@ -0,0 +1,39 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-core +langchain-text-splitters +langsmith +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +python-bidi==0.4.2 +python-docx +python-pptx +sentence_transformers 
+shortuuid +unstructured[all-docs]==0.11.5 +uvicorn +vdms +tqdm +tzlocal +opencv-python +tqdm +tzlocal +PyYAML +typing +decord +einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py new file mode 100644 index 000000000..8c77c9714 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -0,0 +1,121 @@ +from langchain_community.vectorstores import VDMS +from langchain_community.vectorstores.vdms import VDMS_Client +from langchain.pydantic_v1 import BaseModel, root_validator +from langchain_core.embeddings import Embeddings +from decord import VideoReader, cpu +import numpy as np +from typing import List, Optional, Iterable, Dict, Any +from PIL import Image +import torch +import os +import time +import torchvision.transforms as T +toPIL = T.ToPILImage() + +# 'similarity', 'similarity_score_threshold' (needs threshold), 'mmr' + +class vCLIPEmbeddings(BaseModel, Embeddings): + """MeanCLIP Embeddings model.""" + + model: Any + + @root_validator(allow_reuse=True) + def validate_environment(cls, values: Dict) -> Dict: + """Validate that open_clip and torch libraries are installed.""" + try: + # Use the provided model if present + if "model" not in values: + raise ValueError("Model must be provided during initialization.") + + except ImportError: + raise ImportError( + "Please ensure CLIP model is loaded" + ) + return values + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + model_device = next(self.model.clip.parameters()).device + text_features = self.model.get_text_embeddings(texts) + + return text_features.detach().numpy() + + + def embed_query(self, text: str) -> List[float]: + return self.embed_documents([text])[0] + + + def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: + # Open images directly as PIL images + + video_features = [] + for vid_path in sorted(paths): + # Encode the video to get the embeddings + model_device = next(self.model.parameters()).device + # Preprocess the video for the model + clip_images = self.load_video_for_vclip(vid_path, num_frm=self.model.num_frm, + max_img_size=224, + start_time=kwargs.get("start_time", None), + clip_duration=kwargs.get("clip_duration", None) + ) + embeddings_tensor = self.model.get_video_embeddings([clip_images]) + + # Convert tensor to list and add to the video_features list + embeddings_list = embeddings_tensor.tolist() + + video_features.append(embeddings_list) + + return video_features + + + def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): + # Load video with VideoReader + vr = VideoReader(vid_path, ctx=cpu(0)) + fps = vr.get_avg_fps() + num_frames = len(vr) + start_idx = int(fps*kwargs.get("start_time", [0])[0]) + end_idx = start_idx+int(fps*kwargs.get("clip_duration", [num_frames])[0]) + + frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling + clip_images = [] + + # read images + temp_frms = vr.get_batch(frame_idx.astype(int).tolist()) + for idx in range(temp_frms.shape[0]): + im = temp_frms[idx] # H W C + clip_images.append(toPIL(im.permute(2,0,1))) + + return clip_images + + +class VideoVS: + def __init__(self, host, port, selected_db, video_retriever_model, chosen_video_search_type="similarity"): + self.host = host + self.port = port + self.selected_db = selected_db + self.chosen_video_search_type = chosen_video_search_type + self.constraints = None 
+ self.video_collection = 'video-test' + self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) + self.chosen_video_search_type = chosen_video_search_type + + # initialize_db + self.get_db_client() + self.init_db() + + + def get_db_client(self): + + if self.selected_db == 'vdms': + print ('Connecting to VDMS db server . . .') + self.client = VDMS_Client(host=self.host, port=self.port) + + def init_db(self): + print ('Loading db instances') + if self.selected_db == 'vdms': + self.video_db = VDMS( + client=self.client, + embedding=self.video_embedder, + collection_name=self.video_collection, + engine="FaissFlat", + distance_strategy="IP" + ) \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py new file mode 100644 index 000000000..faf5527f7 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -0,0 +1,119 @@ +import os +import time as t +from tqdm import tqdm +import cv2 +import json +import datetime +import random +from tzlocal import get_localzone +import yaml + + +def read_config(path): + with open(path, 'r') as f: + config = yaml.safe_load(f) + return config + +def calculate_intervals(video_path, chunk_duration, clip_duration): + cap = cv2.VideoCapture(video_path) + + if not cap.isOpened(): + print("Error: Could not open video.") + return + + fps = cap.get(cv2.CAP_PROP_FPS) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + total_seconds = total_frames / fps + + intervals = [] + + chunk_frames = int(chunk_duration * fps) + clip_frames = int(clip_duration * fps) + + for start_frame in range(0, total_frames, chunk_frames): + end_frame = min(start_frame + clip_frames, total_frames) + start_time = start_frame / fps + end_time = end_frame / fps + intervals.append((start_frame, end_frame, start_time, end_time)) + + cap.release() + return intervals + +def process_all_videos(config): + path = config['videos'] + meta_output_dir = config['meta_output_dir'] + selected_db = config['vector_db']['choice_of_db'] + emb_path = config['embeddings']['path'] + emb_type = config['embeddings']['type'] + chunk_duration = config['chunk_duration'] + clip_duration = config['clip_duration'] + + + videos = [file for file in os.listdir(path) if file.endswith('.mp4')] # TODO: increase supported video formats + + # print (f'Total {len(videos)} videos will be processed') + metadata = {} + + for i, each_video in enumerate(tqdm(videos)): + metadata[each_video] = {} + keyname = each_video + video_path = os.path.join(path, each_video) + date_time = datetime.datetime.now() # FIXME CHECK: is this correct? 
+ #date_time = t.ctime(os.stat(video_path).st_ctime) + # Get the local timezone of the machine + local_timezone = get_localzone() + if emb_type == 'video': + time_format = "%a %b %d %H:%M:%S %Y" + if not isinstance(date_time, datetime.datetime): + date_time = datetime.datetime.strptime(date_time, time_format) + time = date_time.strftime("%H:%M:%S") + hours, minutes, seconds = map(float, time.split(":")) + date = date_time.strftime("%Y-%m-%d") + year, month, day = map(int, date.split("-")) + + if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: + interval_count = 0 + metadata.pop(each_video) + for start_frame, end_frame, start_time, end_time in calculate_intervals(video_path, chunk_duration, clip_duration): + keyname = os.path.splitext(os.path.basename(video_path))[0]+f"_interval_{interval_count}" + metadata[keyname] = {"timestamp":start_time} + metadata[keyname].update({"date": date, "year": year, "month": month, "day": day, + "time": time, "hours": hours, "minutes": minutes, "seconds": seconds}) + if selected_db == 'vdms': + # Localize the current time to the local timezone of the machine + #Tahani might not need this + current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) + + # Convert the localized time to ISO 8601 format with timezone offset + iso_date_time = current_time_local.isoformat() + metadata[keyname]['date_time'] = {"_date": str(iso_date_time)} + + # Open the video file + cap = cv2.VideoCapture(video_path) + + if int(cv2.__version__.split('.')[0]) < 3: + fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = cap.get(cv2.CAP_PROP_FPS) + + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) + # get the duration + metadata[keyname].update({ + "clip_duration":(min(total_frames,end_frame)-start_frame)/fps, + 'fps': fps, + 'total_frames': total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + 'video_path': f'{os.path.join(path,each_video)}', + }) + cap.release() + interval_count+=1 + metadata[keyname].update({ + 'fps': fps, + 'total_frames': total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + 'video_path': f'{os.path.join(path,each_video)}', + }) + os.makedirs(meta_output_dir, exist_ok=True) + metadata_file = os.path.join(meta_output_dir, f"metadata.json") + with open(metadata_file, "w") as f: + json.dump(metadata, f, indent=4) \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py new file mode 100644 index 000000000..44d290397 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py @@ -0,0 +1,58 @@ +import yaml +import json +import os, sys +import argparse +import torch +import numpy as np +from decord import VideoReader, cpu +from transformers import AutoTokenizer, AutoProcessor, CLIPModel +import torchvision.transforms as T +toPIL = T.ToPILImage() +import torch.nn as nn +from einops import rearrange + +class vCLIP(nn.Module): + def __init__(self, cfg): + super().__init__() + + self.num_frm = cfg["num_frm"] + self.model_name = cfg["model_name"] + + self.clip = CLIPModel.from_pretrained(self.model_name) + self.processor = AutoProcessor.from_pretrained(self.model_name) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + + def get_text_embeddings(self, texts): + """ + input is list of texts + """ + text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt") + text_features = 
self.clip.get_text_features(**text_inputs) + return text_features + + + def get_image_embeddings(self, images): + """ + input is list of images + """ + image_inputs = self.processor(images=images, return_tensors="pt") + image_features = self.clip.get_image_features(**image_inputs) + return image_features + + + def get_video_embeddings(self, frames_batch): + """ + input is list of list of frames in video + """ + self.batch_size = len(frames_batch) + vid_embs = [] + for frames in frames_batch: + frame_embeddings = self.get_image_embeddings(frames) + frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=len(frames_batch)) + # Normalize, mean aggregate and return normalized video_embeddings + frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) + video_embeddings = frame_embeddings.mean(dim=1) + video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True) + vid_embs.append(video_embeddings) + return torch.cat(vid_embs, dim=0) \ No newline at end of file From e87b159c402461fbd37e1524922574da88b780a6 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 14:26:37 -0700 Subject: [PATCH 02/29] dataprep updates Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index b164d263a..ba7b33fb6 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,5 +1,5 @@ # Path to all videos -videos: video_ingest/videos/ +videos: uploaded_files/videos/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? @@ -7,13 +7,13 @@ embeddings: type: 'video' vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: 'video_ingest/embeddings' + path: 'uploaded_files/embeddings' # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml model_type: 'llama_v2' # Path to store metadata files -meta_output_dir: video_ingest/video_metadata/ +meta_output_dir: uploaded_files/video_metadata/ # Chunk duration defines the interval of time that each embedding will occur chunk_duration: 30 # Clip duration defines the length of the interval in which the embeding will occur From dc3b5b77424cdd1ffa4a3dbc56e149c3a5c7c607 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:26:10 -0700 Subject: [PATCH 03/29] rearranged dirs Signed-off-by: srinarayan-srikanthan --- .../vdms/multimodal_langchain/config.yaml | 2 +- .../multimodal_langchain/docker/Dockerfile | 2 +- .../multimodal_langchain/ingest_videos.py | 49 +++++++++++++------ .../utils/store_embeddings.py | 2 + 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index ba7b33fb6..34f7cffeb 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,5 +1,5 @@ # Path to all videos -videos: uploaded_files/videos/ +videos: uploaded_files/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? 
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile index 505448a77..a0de62cba 100644 --- a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile +++ b/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile @@ -31,7 +31,7 @@ ENV PYTHONPATH=/home/user USER root -RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files +RUN mkdir -p /home/user/comps/dataprep/vdms/multimodal_langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/multimodal_langchain USER user diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 72532387f..0d3018537 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -5,13 +5,16 @@ import os import json from tqdm import tqdm -from comps import DocPath, opea_microservices, opea_telemetry, register_microservice +from comps import opea_microservices, register_microservice from utils.utils import read_config, process_all_videos from utils import store_embeddings from utils.vclip import vCLIP +from fastapi import File, HTTPException, UploadFile +import uuid +from typing import Any, Dict, Iterable, List, Optional, Type, Union +import shutil - -VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0") +VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0") def setup_vclip_model(config, device="cpu"): model = vCLIP(config) @@ -55,6 +58,10 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): os.system(f"rm -r tmp_*") print("done.") break + +def generate_video_id(): + """Generates a unique identifier for a video file.""" + return str(uuid.uuid4()) def generate_embeddings(config, embedding_model, vs): print('inside generate') @@ -67,19 +74,11 @@ def generate_embeddings(config, embedding_model, vs): name="opea_service@prepare_doc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", - port=6007, - input_datatype=DocPath, - output_datatype=None, + port=6007 ) -@opea_telemetry -def process_videos(doc_path: DocPath): - """Ingest videos to VDMS.""" - path = doc_path.path - print(f"Parsing videos {path}.") - ################# - #set config_file - ################# +def process_videos(files: List[UploadFile] = File(None)): + """Ingest videos to VDMS.""" config= config = read_config('./config.yaml') meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} @@ -90,6 +89,28 @@ def process_videos(doc_path: DocPath): host = VECTORDB_SERVICE_HOST_IP port = int(config['vector_db']['port']) selected_db = config['vector_db']['choice_of_db'] + print(f"Parsing videos {path}.") + + #Saving videos + if files: + video_files = [] + for file in files: + if os.path.splitext(file.filename)[1] == ".mp4": + video_files.append(file) + else: + raise HTTPException( + status_code=400, detail=f"File {file.filename} is not an mp4 file. Please upload mp4 files only." 
+ ) + + for video_file in video_files: + video_id = generate_video_id() + video_name = os.path.splitext(video_file.filename)[0] + video_file_name = f"{video_name}_{video_id}.mp4" + video_dir_name = os.path.splitext(video_file_name)[0] + # Save video file in upload_directory + with open(os.path.join(path, video_file_name), "wb") as f: + shutil.copyfileobj(video_file.file, f) + # Creating DB print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 8c77c9714..6e5d849d7 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -69,6 +69,8 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): # Load video with VideoReader + import decord + decord.bridge.set_bridge('torch') vr = VideoReader(vid_path, ctx=cpu(0)) fps = vr.get_avg_fps() num_frames = len(vr) From 4045cb8549d9fe362a166c8439e1564c55c6f54d Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:38:51 -0700 Subject: [PATCH 04/29] added readme Signed-off-by: srinarayan-srikanthan --- .../vdms/multimodal_langchain/README.md | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 comps/dataprep/vdms/multimodal_langchain/README.md diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md new file mode 100644 index 000000000..c3579ceb5 --- /dev/null +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -0,0 +1,114 @@ +# Multimodal Dataprep Microservice with VDMS + +For dataprep microservice, we currently provide one framework: `Langchain`. + +# 🚀1. Start Microservice with Python (Option 1) + +## 1.1 Install Requirements + +- option 1: Install Single-process version (for 1-10 files processing) + +```bash +apt-get update +apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils +pip install -r requirements.txt +``` + +## 1.2 Start VDMS Server + +```bash +docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest +``` + +## 1.3 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export host_ip=$(hostname -I | awk '{print $1}') +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export your_hf_api_token="{your_hf_token}" +export PYTHONPATH=${path_to_comps} +``` + +## 1.4 Start Data Preparation Microservice for VDMS with Python Script + +Start document preparation microservice for VDMS with below command. + + +```bash +python ingest_videos.py +``` + +# 🚀2. 
Start Microservice with Docker (Option 2) + +## 2.1 Start VDMS Server + + +```bash +docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest +``` + + +## 2.1 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export host_ip=$(hostname -I | awk '{print $1}') +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export your_hf_api_token="{your_hf_token}" +``` + +## 2.3 Build Docker Image + +- Build docker image +```bash +cd ../../../ + docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + +``` + + +## 2.4 Run Docker Compose + + +```bash +docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d +``` + + +# 🚀3. Status Microservice + +```bash +docker container logs -f dataprep-vdms-server +``` + +# 🚀4. Consume Microservice + +Once data preparation microservice for VDMS is started, user can use below command to invoke the microservice to convert the videos to embedding and save to the database. + +Make sure the file path after `files=@` is correct. + + +- Single file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.mp4" \ + http://localhost:6007/v1/dataprep +``` +- Multiple file upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.mp4" \ + -F "files=@./file2.mp4" \ + -F "files=@./file3.mp4" \ + http://localhost:6007/v1/dataprep +``` + From d4c9441a54300b8acb7e2599ed9bfd80e383dc5f Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 3 Sep 2024 22:44:32 -0700 Subject: [PATCH 05/29] removed checks Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 0d3018537..8a5ff982d 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -56,7 +56,6 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): if i.startswith("tmp_"): print("removing tmp_*") os.system(f"rm -r tmp_*") - print("done.") break def generate_video_id(): @@ -64,7 +63,6 @@ def generate_video_id(): return str(uuid.uuid4()) def generate_embeddings(config, embedding_model, vs): - print('inside generate') process_all_videos(config) global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') print(f'global metadata file available at {global_metadata_file_path}') @@ -119,14 +117,13 @@ def process_videos(files: List[UploadFile] = File(None)): if 'video' == 'video': # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - print('init model') vs = store_embeddings.VideoVS(host, port, selected_db, model) - print('init vector store') + else: print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") return generate_embeddings(config, model, vs) - print('done............success..............') + if __name__ == "__main__": From 40117cb208834fbfefb01fb4ede2c2eb049428f2 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Wed, 4 Sep 2024 14:19:33 -0700 Subject: [PATCH 06/29] added features Signed-off-by: srinarayan-srikanthan --- 
.../vdms/multimodal_langchain/README.md | 2 + .../vdms/multimodal_langchain/config.yaml | 3 -- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 43 ++++++++++++++++--- .../utils/store_embeddings.py | 4 +- 5 files changed, 42 insertions(+), 12 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index c3579ceb5..6cd5828e5 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -28,6 +28,7 @@ export https_proxy=${your_http_proxy} export host_ip=$(hostname -I | awk '{print $1}') export VDMS_HOST=${host_ip} export VDMS_PORT=55555 +export INDEX_NAME="rag-vdms" export your_hf_api_token="{your_hf_token}" export PYTHONPATH=${path_to_comps} ``` @@ -59,6 +60,7 @@ export https_proxy=${your_http_proxy} export host_ip=$(hostname -I | awk '{print $1}') export VDMS_HOST=${host_ip} export VDMS_PORT=55555 +export INDEX_NAME="rag-vdms" export your_hf_api_token="{your_hf_token}" ``` diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 34f7cffeb..209110920 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -22,9 +22,6 @@ clip_duration: 10 vector_db: choice_of_db: 'vdms' # #Supported databases [vdms] - host: 0.0.0.0 - port: 55555 - # LLM path model_path: meta-llama/Llama-2-7b-chat-hf \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml index edb733c7d..a08aa1877 100644 --- a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml @@ -20,7 +20,7 @@ services: https_proxy: ${https_proxy} VDMS_HOST: ${VDMS_HOST} VDMS_PORT: ${VDMS_PORT} - COLLECTION_NAME: ${COLLECTION_NAME} + INDEX_NAME: ${INDEX_NAME} restart: unless-stopped networks: diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 8a5ff982d..d5bb2c321 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -10,11 +10,15 @@ from utils import store_embeddings from utils.vclip import vCLIP from fastapi import File, HTTPException, UploadFile +from fastapi.responses import FileResponse import uuid from typing import Any, Dict, Iterable, List, Optional, Type, Union import shutil +from pathlib import Path VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0") +VECTORDB_SERVICE_PORT = os.getenv("VDMS_PORT", 55555) +collection_name = os.getenv("INDEX_NAME", "rag-vdms") def setup_vclip_model(config, device="cpu"): model = vCLIP(config) @@ -69,23 +73,23 @@ def generate_embeddings(config, embedding_model, vs): store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) @register_microservice( - name="opea_service@prepare_doc_vdms", + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007 ) -def process_videos(files: List[UploadFile] = File(None)): +async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" - config= config = read_config('./config.yaml') + config = read_config('./config.yaml') meanclip_cfg = 
{"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} generate_frames = config['generate_frames'] path = config['videos'] meta_output_dir = config['meta_output_dir'] emb_path = config['embeddings']['path'] host = VECTORDB_SERVICE_HOST_IP - port = int(config['vector_db']['port']) + port = int(VECTORDB_SERVICE_PORT) selected_db = config['vector_db']['choice_of_db'] print(f"Parsing videos {path}.") @@ -117,14 +121,41 @@ def process_videos(files: List[UploadFile] = File(None)): if 'video' == 'video': # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model) + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name) else: print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") return generate_embeddings(config, model, vs) +@register_microservice( + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007 +) +async def rag_get_file_structure(): + """Returns list of names of uploaded videos saved on the server.""" + config = read_config('./config.yaml') + if not Path( config['videos']).exists(): + print("No file uploaded, return empty list.") + return [] + + uploaded_videos = os.listdir(config['videos']) + mp4_files = [file for file in uploaded_videos if file.endswith(".mp4")] + return mp4_files + +@register_microservice( + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007 +) +async def rag_get_file(filename: str): + """Download the file from remote.""" + + config = read_config('./config.yaml') + UPLOAD_DIR=config['videos'] + file_path = os.path.join(UPLOAD_DIR, filename) + if os.path.exists(file_path): + return FileResponse(path=file_path, filename=filename) + else: + return {"error": "File not found"} if __name__ == "__main__": - opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_videodoc_vdms"].start() \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 6e5d849d7..3487c675b 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -90,13 +90,13 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, chosen_video_search_type="similarity"): + def __init__(self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity"): self.host = host self.port = port self.selected_db = selected_db self.chosen_video_search_type = chosen_video_search_type self.constraints = None - self.video_collection = 'video-test' + self.video_collection = collection_name self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) self.chosen_video_search_type = chosen_video_search_type From f9d1e2b3979532e339985b88e7e52baae98f856e Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Wed, 4 Sep 2024 21:49:25 -0700 Subject: [PATCH 07/29] added get method Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/multimodal_langchain/README.md | 4 ++++ comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 4 ++-- 2 files 
changed, 6 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 6cd5828e5..44c54df97 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -113,4 +113,8 @@ curl -X POST \ -F "files=@./file3.mp4" \ http://localhost:6007/v1/dataprep ``` +- List of uploaded files +```bash +curl -X POST http://localhost:6007/v1/dataprep/get_videos +``` diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index d5bb2c321..fa1467eab 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -129,7 +129,7 @@ async def process_videos(files: List[UploadFile] = File(None)): generate_embeddings(config, model, vs) @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007,methods=["GET"] ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -143,7 +143,7 @@ async def rag_get_file_structure(): return mp4_files @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007,methods=["GET"] ) async def rag_get_file(filename: str): """Download the file from remote.""" From ea8e83eac4a0d41026229d63ffbcb461db3b1ecf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Sep 2024 04:56:10 +0000 Subject: [PATCH 08/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/vdms/langchain/__init__.py | 2 +- comps/dataprep/vdms/langchain/config.py | 2 +- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../vdms/langchain/prepare_doc_vdms.py | 3 +- .../dataprep/vdms/langchain/requirements.txt | 16 +-- .../vdms/multimodal_langchain/README.md | 12 +- .../vdms/multimodal_langchain/__init__.py | 2 +- .../vdms/multimodal_langchain/config.yaml | 15 +- .../docker/docker-compose-dataprep-vdms.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 129 ++++++++++-------- .../multimodal_langchain/requirements.txt | 16 +-- .../utils/store_embeddings.py | 71 +++++----- .../vdms/multimodal_langchain/utils/utils.py | 109 +++++++++------ .../vdms/multimodal_langchain/utils/vclip.py | 40 +++--- 14 files changed, 228 insertions(+), 193 deletions(-) diff --git a/comps/dataprep/vdms/langchain/__init__.py b/comps/dataprep/vdms/langchain/__init__.py index 4582b4f9a..916f3a44b 100644 --- a/comps/dataprep/vdms/langchain/__init__.py +++ b/comps/dataprep/vdms/langchain/__init__.py @@ -1,2 +1,2 @@ # Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/vdms/langchain/config.py b/comps/dataprep/vdms/langchain/config.py index 3e3e06a16..e12ba1502 100644 --- a/comps/dataprep/vdms/langchain/config.py +++ b/comps/dataprep/vdms/langchain/config.py @@ -30,4 +30,4 @@ def getEnv(key, default_value=None): CHUNK_OVERLAP = getEnv("CHUNK_OVERLAP", 100) current_file_path = 
os.path.abspath(__file__) -parent_dir = os.path.dirname(current_file_path) \ No newline at end of file +parent_dir = os.path.dirname(current_file_path) diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml index edb733c7d..46880119e 100644 --- a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml +++ b/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml @@ -25,4 +25,4 @@ services: networks: default: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index 13591dd26..e6f7d0072 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -78,4 +77,4 @@ def ingest_documents(doc_path: DocPath): if __name__ == "__main__": - opea_microservices["opea_service@prepare_doc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_doc_vdms"].start() diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt index 859dec9f9..f6044266c 100644 --- a/comps/dataprep/vdms/langchain/requirements.txt +++ b/comps/dataprep/vdms/langchain/requirements.txt @@ -1,8 +1,10 @@ beautifulsoup4 cairosvg +decord docarray[full] docx2txt easyocr +einops fastapi huggingface_hub langchain @@ -12,6 +14,7 @@ langchain-text-splitters langsmith markdown numpy +opencv-python opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk @@ -23,17 +26,12 @@ pyspark python-bidi==0.4.2 python-docx python-pptx +PyYAML sentence_transformers shortuuid +tqdm +typing +tzlocal unstructured[all-docs]==0.11.5 uvicorn vdms -tqdm -tzlocal -opencv-python -tqdm -tzlocal -PyYAML -typing -decord -einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 44c54df97..54e878b65 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -37,7 +37,6 @@ export PYTHONPATH=${path_to_comps} Start document preparation microservice for VDMS with below command. - ```bash python ingest_videos.py ``` @@ -46,12 +45,10 @@ python ingest_videos.py ## 2.1 Start VDMS Server - ```bash docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest ``` - ## 2.1 Setup Environment Variables ```bash @@ -66,22 +63,20 @@ export your_hf_api_token="{your_hf_token}" ## 2.3 Build Docker Image -- Build docker image +- Build docker image + ```bash cd ../../../ docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . ``` - ## 2.4 Run Docker Compose - ```bash docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d ``` - # 🚀3. Status Microservice ```bash @@ -94,7 +89,6 @@ Once data preparation microservice for VDMS is started, user can use below comma Make sure the file path after `files=@` is correct. 
- - Single file upload ```bash @@ -103,6 +97,7 @@ curl -X POST \ -F "files=@./file1.mp4" \ http://localhost:6007/v1/dataprep ``` + - Multiple file upload ```bash @@ -113,6 +108,7 @@ curl -X POST \ -F "files=@./file3.mp4" \ http://localhost:6007/v1/dataprep ``` + - List of uploaded files ```bash diff --git a/comps/dataprep/vdms/multimodal_langchain/__init__.py b/comps/dataprep/vdms/multimodal_langchain/__init__.py index 4582b4f9a..916f3a44b 100644 --- a/comps/dataprep/vdms/multimodal_langchain/__init__.py +++ b/comps/dataprep/vdms/multimodal_langchain/__init__.py @@ -1,2 +1,2 @@ # Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 209110920..40c327615 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -1,27 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + # Path to all videos videos: uploaded_files/ # Do you want to extract frames of videos (True if not done already, else False) generate_frames: True # How do you want to generate feature embeddings? embeddings: - type: 'video' + type: "video" vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: 'uploaded_files/embeddings' + path: "uploaded_files/embeddings" # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml - model_type: 'llama_v2' + model_type: "llama_v2" # Path to store metadata files meta_output_dir: uploaded_files/video_metadata/ # Chunk duration defines the interval of time that each embedding will occur chunk_duration: 30 -# Clip duration defines the length of the interval in which the embeding will occur +# Clip duration defines the length of the interval in which the embedding will occur clip_duration: 10 # e.g. 
For every <chunk_duration>, you embed the first <clip_duration>'s frames of that interval
 vector_db:
-  choice_of_db: 'vdms' # #Supported databases [vdms]
+  choice_of_db: "vdms" # #Supported databases [vdms]

 # LLM path
-model_path: meta-llama/Llama-2-7b-chat-hf
\ No newline at end of file
+model_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
index a08aa1877..785dc6408 100644
--- a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
+++ b/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml
@@ -25,4 +25,4 @@ services:

 networks:
   default:
-    driver: bridge
\ No newline at end of file
+    driver: bridge
diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
index fa1467eab..eeadcb1e1 100644
--- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
+++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py
@@ -1,56 +1,60 @@
-
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-import os
 import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+from fastapi import File, HTTPException, UploadFile
+from fastapi.responses import FileResponse
 from tqdm import tqdm
-from comps import opea_microservices, register_microservice
-from utils.utils import read_config, process_all_videos
 from utils import store_embeddings
+from utils.utils import process_all_videos, read_config
 from utils.vclip import vCLIP
-from fastapi import File, HTTPException, UploadFile
-from fastapi.responses import FileResponse
-import uuid
-from typing import Any, Dict, Iterable, List, Optional, Type, Union
-import shutil
-from pathlib import Path
+
+from comps import opea_microservices, register_microservice

 VECTORDB_SERVICE_HOST_IP = os.getenv("VDMS_HOST", "0.0.0.0")
 VECTORDB_SERVICE_PORT = os.getenv("VDMS_PORT", 55555)
 collection_name = os.getenv("INDEX_NAME", "rag-vdms")

+
 def setup_vclip_model(config, device="cpu"):
     model = vCLIP(config)
     return model

+
 def read_json(path):
     with open(path) as f:
         x = json.load(f)
     return x

+
 def store_into_vectordb(vs, metadata_file_path, embedding_model, config):
     GMetadata = read_json(metadata_file_path)
     global_counter = 0
     total_videos = len(GMetadata.keys())
-
+
     for idx, (video, data) in enumerate(tqdm(GMetadata.items())):
         image_name_list = []
         embedding_list = []
         metadata_list = []
         ids = []
-
-        if config['embeddings']['type'] == 'video':
-            data['video'] = video
+
+        if config["embeddings"]["type"] == "video":
+            data["video"] = video
             video_name_list = [data["video_path"]]
             metadata_list = [data]
-            if vs.selected_db == 'vdms':
+            if vs.selected_db == "vdms":
                 vs.video_db.add_videos(
                     paths=video_name_list,
                     metadatas=metadata_list,
-                    start_time=[data['timestamp']],
-                    clip_duration=[data['clip_duration']]
+                    start_time=[data["timestamp"]],
+                    clip_duration=[data["clip_duration"]],
                 )
             else:
                 print(f"ERROR: selected_db {vs.selected_db} not supported.
Supported:[vdms]") @@ -59,41 +63,41 @@ def store_into_vectordb(vs, metadata_file_path, embedding_model, config): for i in os.listdir(): if i.startswith("tmp_"): print("removing tmp_*") - os.system(f"rm -r tmp_*") + os.system("rm -r tmp_*") break - + + def generate_video_id(): """Generates a unique identifier for a video file.""" - return str(uuid.uuid4()) + return str(uuid.uuid4()) + def generate_embeddings(config, embedding_model, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') - print(f'global metadata file available at {global_metadata_file_path}') + global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") + print(f"global metadata file available at {global_metadata_file_path}") store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) - -@register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007 -) + +@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" - - config = read_config('./config.yaml') - meanclip_cfg = {"model_name": config['embeddings']['vclip_model_name'], "num_frm": config['embeddings']['vclip_num_frm']} - generate_frames = config['generate_frames'] - path = config['videos'] - meta_output_dir = config['meta_output_dir'] - emb_path = config['embeddings']['path'] + + config = read_config("./config.yaml") + meanclip_cfg = { + "model_name": config["embeddings"]["vclip_model_name"], + "num_frm": config["embeddings"]["vclip_num_frm"], + } + generate_frames = config["generate_frames"] + path = config["videos"] + meta_output_dir = config["meta_output_dir"] + emb_path = config["embeddings"]["path"] host = VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config['vector_db']['choice_of_db'] + selected_db = config["vector_db"]["choice_of_db"] print(f"Parsing videos {path}.") - - #Saving videos + + # Saving videos if files: video_files = [] for file in files: @@ -113,43 +117,56 @@ async def process_videos(files: List[UploadFile] = File(None)): with open(os.path.join(path, video_file_name), "wb") as f: shutil.copyfileobj(video_file.file, f) - # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') - print('Connecting to {} at {}:{}'.format(selected_db, host, port)) - #check embedding type - if 'video' == 'video': + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." 
+ ) + print("Connecting to {} at {}:{}".format(selected_db, host, port)) + # check embedding type + if "video" == "video": # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name) + vs = store_embeddings.VideoVS(host, port, selected_db, model, collection_name) else: - print(f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in [\'video\', \'frame\']") + print( + f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in ['video', 'frame']" + ) return generate_embeddings(config, model, vs) - + + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007,methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_videos", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" - config = read_config('./config.yaml') - if not Path( config['videos']).exists(): + config = read_config("./config.yaml") + if not Path(config["videos"]).exists(): print("No file uploaded, return empty list.") return [] - uploaded_videos = os.listdir(config['videos']) + uploaded_videos = os.listdir(config["videos"]) mp4_files = [file for file in uploaded_videos if file.endswith(".mp4")] return mp4_files + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007,methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_file/{filename}", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file(filename: str): """Download the file from remote.""" - - config = read_config('./config.yaml') - UPLOAD_DIR=config['videos'] + + config = read_config("./config.yaml") + UPLOAD_DIR = config["videos"] file_path = os.path.join(UPLOAD_DIR, filename) if os.path.exists(file_path): return FileResponse(path=file_path, filename=filename) @@ -158,4 +175,4 @@ async def rag_get_file(filename: str): if __name__ == "__main__": - opea_microservices["opea_service@prepare_videodoc_vdms"].start() \ No newline at end of file + opea_microservices["opea_service@prepare_videodoc_vdms"].start() diff --git a/comps/dataprep/vdms/multimodal_langchain/requirements.txt b/comps/dataprep/vdms/multimodal_langchain/requirements.txt index 859dec9f9..f6044266c 100644 --- a/comps/dataprep/vdms/multimodal_langchain/requirements.txt +++ b/comps/dataprep/vdms/multimodal_langchain/requirements.txt @@ -1,8 +1,10 @@ beautifulsoup4 cairosvg +decord docarray[full] docx2txt easyocr +einops fastapi huggingface_hub langchain @@ -12,6 +14,7 @@ langchain-text-splitters langsmith markdown numpy +opencv-python opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk @@ -23,17 +26,12 @@ pyspark python-bidi==0.4.2 python-docx python-pptx +PyYAML sentence_transformers shortuuid +tqdm +typing +tzlocal unstructured[all-docs]==0.11.5 uvicorn vdms -tqdm -tzlocal -opencv-python -tqdm -tzlocal -PyYAML -typing -decord -einops \ No newline at end of file diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 3487c675b..6468e5195 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -1,19 +1,25 @@ 
+# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Any, Dict, Iterable, List, Optional + +import numpy as np +import torch +import torchvision.transforms as T +from decord import VideoReader, cpu +from langchain.pydantic_v1 import BaseModel, root_validator from langchain_community.vectorstores import VDMS from langchain_community.vectorstores.vdms import VDMS_Client -from langchain.pydantic_v1 import BaseModel, root_validator from langchain_core.embeddings import Embeddings -from decord import VideoReader, cpu -import numpy as np -from typing import List, Optional, Iterable, Dict, Any from PIL import Image -import torch -import os -import time -import torchvision.transforms as T + toPIL = T.ToPILImage() # 'similarity', 'similarity_score_threshold' (needs threshold), 'mmr' + class vCLIPEmbeddings(BaseModel, Embeddings): """MeanCLIP Embeddings model.""" @@ -28,9 +34,7 @@ def validate_environment(cls, values: Dict) -> Dict: raise ValueError("Model must be provided during initialization.") except ImportError: - raise ImportError( - "Please ensure CLIP model is loaded" - ) + raise ImportError("Please ensure CLIP model is loaded") return values def embed_documents(self, texts: List[str]) -> List[List[float]]: @@ -39,11 +43,9 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: return text_features.detach().numpy() - def embed_query(self, text: str) -> List[float]: return self.embed_documents([text])[0] - def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: # Open images directly as PIL images @@ -52,11 +54,13 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: # Encode the video to get the embeddings model_device = next(self.model.parameters()).device # Preprocess the video for the model - clip_images = self.load_video_for_vclip(vid_path, num_frm=self.model.num_frm, - max_img_size=224, - start_time=kwargs.get("start_time", None), - clip_duration=kwargs.get("clip_duration", None) - ) + clip_images = self.load_video_for_vclip( + vid_path, + num_frm=self.model.num_frm, + max_img_size=224, + start_time=kwargs.get("start_time", None), + clip_duration=kwargs.get("clip_duration", None), + ) embeddings_tensor = self.model.get_video_embeddings([clip_images]) # Convert tensor to list and add to the video_features list @@ -66,31 +70,33 @@ def embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: return video_features - def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): # Load video with VideoReader import decord - decord.bridge.set_bridge('torch') + + decord.bridge.set_bridge("torch") vr = VideoReader(vid_path, ctx=cpu(0)) fps = vr.get_avg_fps() num_frames = len(vr) - start_idx = int(fps*kwargs.get("start_time", [0])[0]) - end_idx = start_idx+int(fps*kwargs.get("clip_duration", [num_frames])[0]) + start_idx = int(fps * kwargs.get("start_time", [0])[0]) + end_idx = start_idx + int(fps * kwargs.get("clip_duration", [num_frames])[0]) - frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling + frame_idx = np.linspace(start_idx, end_idx, num=num_frm, endpoint=False, dtype=int) # Uniform sampling clip_images = [] # read images temp_frms = vr.get_batch(frame_idx.astype(int).tolist()) for idx in range(temp_frms.shape[0]): - im = temp_frms[idx] # H W C - clip_images.append(toPIL(im.permute(2,0,1))) + im = temp_frms[idx] # H W C + clip_images.append(toPIL(im.permute(2, 0, 1))) 
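# decord returns frames in HWC layout; ToPILImage expects CHW, hence the permute above.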
return clip_images class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity"): + def __init__( + self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity" + ): self.host = host self.port = port self.selected_db = selected_db @@ -104,20 +110,19 @@ def __init__(self, host, port, selected_db, video_retriever_model, collection_na self.get_db_client() self.init_db() - def get_db_client(self): - if self.selected_db == 'vdms': - print ('Connecting to VDMS db server . . .') + if self.selected_db == "vdms": + print("Connecting to VDMS db server . . .") self.client = VDMS_Client(host=self.host, port=self.port) def init_db(self): - print ('Loading db instances') - if self.selected_db == 'vdms': + print("Loading db instances") + if self.selected_db == "vdms": self.video_db = VDMS( client=self.client, embedding=self.video_embedder, collection_name=self.video_collection, engine="FaissFlat", - distance_strategy="IP" - ) \ No newline at end of file + distance_strategy="IP", + ) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index faf5527f7..d83c0319f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -1,19 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import datetime +import json import os +import random import time as t -from tqdm import tqdm + import cv2 -import json -import datetime -import random -from tzlocal import get_localzone import yaml +from tqdm import tqdm +from tzlocal import get_localzone def read_config(path): - with open(path, 'r') as f: + with open(path, "r") as f: config = yaml.safe_load(f) return config - + + def calculate_intervals(video_path, chunk_duration, clip_duration): cap = cv2.VideoCapture(video_path) @@ -39,30 +44,30 @@ def calculate_intervals(video_path, chunk_duration, clip_duration): cap.release() return intervals -def process_all_videos(config): - path = config['videos'] - meta_output_dir = config['meta_output_dir'] - selected_db = config['vector_db']['choice_of_db'] - emb_path = config['embeddings']['path'] - emb_type = config['embeddings']['type'] - chunk_duration = config['chunk_duration'] - clip_duration = config['clip_duration'] +def process_all_videos(config): + path = config["videos"] + meta_output_dir = config["meta_output_dir"] + selected_db = config["vector_db"]["choice_of_db"] + emb_path = config["embeddings"]["path"] + emb_type = config["embeddings"]["type"] + chunk_duration = config["chunk_duration"] + clip_duration = config["clip_duration"] - videos = [file for file in os.listdir(path) if file.endswith('.mp4')] # TODO: increase supported video formats + videos = [file for file in os.listdir(path) if file.endswith(".mp4")] # TODO: increase supported video formats # print (f'Total {len(videos)} videos will be processed') metadata = {} - + for i, each_video in enumerate(tqdm(videos)): metadata[each_video] = {} keyname = each_video video_path = os.path.join(path, each_video) - date_time = datetime.datetime.now() # FIXME CHECK: is this correct? - #date_time = t.ctime(os.stat(video_path).st_ctime) + date_time = datetime.datetime.now() # FIXME CHECK: is this correct? 
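# datetime.datetime.now() stamps each clip with the ingestion time; the
# commented-out alternative below would instead use the file's st_ctime
# (filesystem creation/metadata-change time).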
+ # date_time = t.ctime(os.stat(video_path).st_ctime) # Get the local timezone of the machine local_timezone = get_localzone() - if emb_type == 'video': + if emb_type == "video": time_format = "%a %b %d %H:%M:%S %Y" if not isinstance(date_time, datetime.datetime): date_time = datetime.datetime.strptime(date_time, time_format) @@ -74,46 +79,62 @@ def process_all_videos(config): if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: interval_count = 0 metadata.pop(each_video) - for start_frame, end_frame, start_time, end_time in calculate_intervals(video_path, chunk_duration, clip_duration): - keyname = os.path.splitext(os.path.basename(video_path))[0]+f"_interval_{interval_count}" - metadata[keyname] = {"timestamp":start_time} - metadata[keyname].update({"date": date, "year": year, "month": month, "day": day, - "time": time, "hours": hours, "minutes": minutes, "seconds": seconds}) - if selected_db == 'vdms': + for start_frame, end_frame, start_time, end_time in calculate_intervals( + video_path, chunk_duration, clip_duration + ): + keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" + metadata[keyname] = {"timestamp": start_time} + metadata[keyname].update( + { + "date": date, + "year": year, + "month": month, + "day": day, + "time": time, + "hours": hours, + "minutes": minutes, + "seconds": seconds, + } + ) + if selected_db == "vdms": # Localize the current time to the local timezone of the machine - #Tahani might not need this + # Tahani might not need this current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) # Convert the localized time to ISO 8601 format with timezone offset iso_date_time = current_time_local.isoformat() - metadata[keyname]['date_time'] = {"_date": str(iso_date_time)} + metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} # Open the video file cap = cv2.VideoCapture(video_path) - if int(cv2.__version__.split('.')[0]) < 3: + if int(cv2.__version__.split(".")[0]) < 3: fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) else: fps = cap.get(cv2.CAP_PROP_FPS) - + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) # get the duration - metadata[keyname].update({ - "clip_duration":(min(total_frames,end_frame)-start_frame)/fps, - 'fps': fps, - 'total_frames': total_frames, + metadata[keyname].update( + { + "clip_duration": (min(total_frames, end_frame) - start_frame) / fps, + "fps": fps, + "total_frames": total_frames, #'embedding_path': os.path.join(emb_path, each_video+".pt"), - 'video_path': f'{os.path.join(path,each_video)}', - }) + "video_path": f"{os.path.join(path,each_video)}", + } + ) cap.release() - interval_count+=1 - metadata[keyname].update({ - 'fps': fps, - 'total_frames': total_frames, + interval_count += 1 + metadata[keyname].update( + { + "fps": fps, + "total_frames": total_frames, #'embedding_path': os.path.join(emb_path, each_video+".pt"), - 'video_path': f'{os.path.join(path,each_video)}', - }) + "video_path": f"{os.path.join(path,each_video)}", + } + ) os.makedirs(meta_output_dir, exist_ok=True) - metadata_file = os.path.join(meta_output_dir, f"metadata.json") + metadata_file = os.path.join(meta_output_dir, "metadata.json") with open(metadata_file, "w") as f: - json.dump(metadata, f, indent=4) \ No newline at end of file + json.dump(metadata, f, indent=4) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py index 44d290397..89e5830d6 100644 --- 
a/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/vclip.py @@ -1,16 +1,23 @@ -import yaml -import json -import os, sys +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse -import torch +import json +import os +import sys + import numpy as np -from decord import VideoReader, cpu -from transformers import AutoTokenizer, AutoProcessor, CLIPModel +import torch import torchvision.transforms as T +import yaml +from decord import VideoReader, cpu +from transformers import AutoProcessor, AutoTokenizer, CLIPModel + toPIL = T.ToPILImage() import torch.nn as nn from einops import rearrange + class vCLIP(nn.Module): def __init__(self, cfg): super().__init__() @@ -22,37 +29,28 @@ def __init__(self, cfg): self.processor = AutoProcessor.from_pretrained(self.model_name) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - def get_text_embeddings(self, texts): - """ - input is list of texts - """ + """Input is list of texts.""" text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt") text_features = self.clip.get_text_features(**text_inputs) return text_features - def get_image_embeddings(self, images): - """ - input is list of images - """ + """Input is list of images.""" image_inputs = self.processor(images=images, return_tensors="pt") image_features = self.clip.get_image_features(**image_inputs) - return image_features - + return image_features def get_video_embeddings(self, frames_batch): - """ - input is list of list of frames in video - """ + """Input is list of list of frames in video.""" self.batch_size = len(frames_batch) vid_embs = [] for frames in frames_batch: frame_embeddings = self.get_image_embeddings(frames) frame_embeddings = rearrange(frame_embeddings, "(b n) d -> b n d", b=len(frames_batch)) # Normalize, mean aggregate and return normalized video_embeddings - frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) + frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True) video_embeddings = frame_embeddings.mean(dim=1) video_embeddings = video_embeddings / video_embeddings.norm(dim=-1, keepdim=True) vid_embs.append(video_embeddings) - return torch.cat(vid_embs, dim=0) \ No newline at end of file + return torch.cat(vid_embs, dim=0) From 200e3187bd3beb5ad0f1a459aef421fa58e5b2c0 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Thu, 5 Sep 2024 17:54:01 +0800 Subject: [PATCH 09/29] add dim at init, rm unused Signed-off-by: BaoHuiling --- .../vdms/multimodal_langchain/config.yaml | 4 +- .../multimodal_langchain/ingest_videos.py | 90 ++++++-------- .../utils/store_embeddings.py | 7 +- .../vdms/multimodal_langchain/utils/utils.py | 117 +++++++++--------- 4 files changed, 101 insertions(+), 117 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index 40c327615..b2f420180 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -7,10 +7,10 @@ videos: uploaded_files/ generate_frames: True # How do you want to generate feature embeddings? 
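# Note: vector_dimensions below must match the embedding width of vclip_model_name;
# openai/clip-vit-base-patch32 outputs 512-dimensional vectors.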
embeddings: - type: "video" vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 - path: "uploaded_files/embeddings" + vector_dimensions: 512 + path: 'uploaded_files/embeddings' # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index eeadcb1e1..ecfda8a50 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -32,32 +32,28 @@ def read_json(path): x = json.load(f) return x - -def store_into_vectordb(vs, metadata_file_path, embedding_model, config): +def store_into_vectordb(vs, metadata_file_path, dimensions): GMetadata = read_json(metadata_file_path) - global_counter = 0 total_videos = len(GMetadata.keys()) for idx, (video, data) in enumerate(tqdm(GMetadata.items())): - image_name_list = [] - embedding_list = [] metadata_list = [] ids = [] - - if config["embeddings"]["type"] == "video": - data["video"] = video - video_name_list = [data["video_path"]] - metadata_list = [data] - if vs.selected_db == "vdms": - vs.video_db.add_videos( - paths=video_name_list, - metadatas=metadata_list, - start_time=[data["timestamp"]], - clip_duration=[data["clip_duration"]], - ) - else: - print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") + + data['video'] = video + video_name_list = [data["video_path"]] + metadata_list = [data] + if vs.selected_db == 'vdms': + vs.video_db.add_videos( + paths=video_name_list, + metadatas=metadata_list, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']], + embedding_dimensions=dimensions, + ) + else: + print(f"ERROR: selected_db {vs.selected_db} not supported. 
Supported:[vdms]") # clean up tmp_ folders containing frames (jpeg) for i in os.listdir(): @@ -71,15 +67,19 @@ def generate_video_id(): """Generates a unique identifier for a video file.""" return str(uuid.uuid4()) - -def generate_embeddings(config, embedding_model, vs): +def generate_embeddings(config, dimensions, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") - print(f"global metadata file available at {global_metadata_file_path}") - store_into_vectordb(vs, global_metadata_file_path, embedding_model, config) - + global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') + print(f'global metadata file available at {global_metadata_file_path}') + store_into_vectordb(vs, global_metadata_file_path, dimensions) + +@register_microservice( + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007 +) -@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" @@ -94,7 +94,8 @@ async def process_videos(files: List[UploadFile] = File(None)): emb_path = config["embeddings"]["path"] host = VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config["vector_db"]["choice_of_db"] + selected_db = config['vector_db']['choice_of_db'] + vector_dimensions = config["embeddings"]["vector_dimensions"] print(f"Parsing videos {path}.") # Saving videos @@ -118,30 +119,17 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print( - "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." 
- ) - print("Connecting to {} at {}:{}".format(selected_db, host, port)) - # check embedding type - if "video" == "video": - # init meanclip model - model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model, collection_name) - - else: - print( - f"ERROR: Selected embedding type in config.yaml {config['embeddings']['type']} is not in ['video', 'frame']" - ) - return - generate_embeddings(config, model, vs) + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') + print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + # init meanclip model + model = setup_vclip_model(meanclip_cfg, device="cpu") + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + generate_embeddings(config, vector_dimensions, vs) + @register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep/get_videos", - host="0.0.0.0", - port=6007, - methods=["GET"], + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007, methods=["GET"] ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -156,11 +144,7 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep/get_file/{filename}", - host="0.0.0.0", - port=6007, - methods=["GET"], + name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007, methods=["GET"] ) async def rag_get_file(filename: str): """Download the file from remote.""" diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 6468e5195..3ff5e21ee 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -94,9 +94,8 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__( - self, host, port, selected_db, video_retriever_model, collection_name, chosen_video_search_type="similarity" - ): + def __init__(self, host, port, selected_db, video_retriever_model, collection_name, embedding_dimensions:int = 512, chosen_video_search_type="similarity"): + self.host = host self.port = port self.selected_db = selected_db @@ -105,6 +104,7 @@ def __init__( self.video_collection = collection_name self.video_embedder = vCLIPEmbeddings(model=video_retriever_model) self.chosen_video_search_type = chosen_video_search_type + self.embedding_dimensions = embedding_dimensions # initialize_db self.get_db_client() @@ -125,4 +125,5 @@ def init_db(self): collection_name=self.video_collection, engine="FaissFlat", distance_strategy="IP", + embedding_dimensions=self.embedding_dimensions ) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index d83c0319f..ba661cffd 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -67,65 +67,64 @@ def process_all_videos(config): # date_time = t.ctime(os.stat(video_path).st_ctime) # Get the local timezone of the machine local_timezone = get_localzone() - if emb_type == "video": - time_format = "%a %b %d 
%H:%M:%S %Y" - if not isinstance(date_time, datetime.datetime): - date_time = datetime.datetime.strptime(date_time, time_format) - time = date_time.strftime("%H:%M:%S") - hours, minutes, seconds = map(float, time.split(":")) - date = date_time.strftime("%Y-%m-%d") - year, month, day = map(int, date.split("-")) - - if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: - interval_count = 0 - metadata.pop(each_video) - for start_frame, end_frame, start_time, end_time in calculate_intervals( - video_path, chunk_duration, clip_duration - ): - keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" - metadata[keyname] = {"timestamp": start_time} - metadata[keyname].update( - { - "date": date, - "year": year, - "month": month, - "day": day, - "time": time, - "hours": hours, - "minutes": minutes, - "seconds": seconds, - } - ) - if selected_db == "vdms": - # Localize the current time to the local timezone of the machine - # Tahani might not need this - current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) - - # Convert the localized time to ISO 8601 format with timezone offset - iso_date_time = current_time_local.isoformat() - metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} - - # Open the video file - cap = cv2.VideoCapture(video_path) - - if int(cv2.__version__.split(".")[0]) < 3: - fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) - else: - fps = cap.get(cv2.CAP_PROP_FPS) - - total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) - # get the duration - metadata[keyname].update( - { - "clip_duration": (min(total_frames, end_frame) - start_frame) / fps, - "fps": fps, - "total_frames": total_frames, - #'embedding_path': os.path.join(emb_path, each_video+".pt"), - "video_path": f"{os.path.join(path,each_video)}", - } - ) - cap.release() - interval_count += 1 + time_format = "%a %b %d %H:%M:%S %Y" + if not isinstance(date_time, datetime.datetime): + date_time = datetime.datetime.strptime(date_time, time_format) + time = date_time.strftime("%H:%M:%S") + hours, minutes, seconds = map(float, time.split(":")) + date = date_time.strftime("%Y-%m-%d") + year, month, day = map(int, date.split("-")) + + if clip_duration is not None and chunk_duration is not None and clip_duration <= chunk_duration: + interval_count = 0 + metadata.pop(each_video) + for start_frame, end_frame, start_time, end_time in calculate_intervals( + video_path, chunk_duration, clip_duration + ): + keyname = os.path.splitext(os.path.basename(video_path))[0] + f"_interval_{interval_count}" + metadata[keyname] = {"timestamp": start_time} + metadata[keyname].update( + { + "date": date, + "year": year, + "month": month, + "day": day, + "time": time, + "hours": hours, + "minutes": minutes, + "seconds": seconds, + } + ) + if selected_db == "vdms": + # Localize the current time to the local timezone of the machine + # Tahani might not need this + current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone) + + # Convert the localized time to ISO 8601 format with timezone offset + iso_date_time = current_time_local.isoformat() + metadata[keyname]["date_time"] = {"_date": str(iso_date_time)} + + # Open the video file + cap = cv2.VideoCapture(video_path) + + if int(cv2.__version__.split(".")[0]) < 3: + fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = cap.get(cv2.CAP_PROP_FPS) + + total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) + # get the duration + metadata[keyname].update( + { + 
"clip_duration": (min(total_frames, end_frame) - start_frame) / fps, + "fps": fps, + "total_frames": total_frames, + #'embedding_path': os.path.join(emb_path, each_video+".pt"), + "video_path": f"{os.path.join(path,each_video)}", + } + ) + cap.release() + interval_count += 1 metadata[keyname].update( { "fps": fps, From c6e12f1c4f56e85cbaf130b0ae2490fc5662d74c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:55:00 +0000 Subject: [PATCH 10/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdms/multimodal_langchain/config.yaml | 2 +- .../multimodal_langchain/ingest_videos.py | 52 +++++++++++-------- .../utils/store_embeddings.py | 15 ++++-- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/config.yaml b/comps/dataprep/vdms/multimodal_langchain/config.yaml index b2f420180..43ce11f4f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/config.yaml +++ b/comps/dataprep/vdms/multimodal_langchain/config.yaml @@ -10,7 +10,7 @@ embeddings: vclip_model_name: "openai/clip-vit-base-patch32" vclip_num_frm: 64 vector_dimensions: 512 - path: 'uploaded_files/embeddings' + path: "uploaded_files/embeddings" # VL-branch config vl_branch: cfg_path: embedding/video_llama_config/video_llama_eval_only_vl.yaml diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index ecfda8a50..6092ca53f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -32,6 +32,7 @@ def read_json(path): x = json.load(f) return x + def store_into_vectordb(vs, metadata_file_path, dimensions): GMetadata = read_json(metadata_file_path) @@ -40,16 +41,16 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): for idx, (video, data) in enumerate(tqdm(GMetadata.items())): metadata_list = [] ids = [] - - data['video'] = video + + data["video"] = video video_name_list = [data["video_path"]] metadata_list = [data] - if vs.selected_db == 'vdms': + if vs.selected_db == "vdms": vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data['timestamp']], - clip_duration=[data['clip_duration']], + start_time=[data["timestamp"]], + clip_duration=[data["clip_duration"]], embedding_dimensions=dimensions, ) else: @@ -67,19 +68,15 @@ def generate_video_id(): """Generates a unique identifier for a video file.""" return str(uuid.uuid4()) + def generate_embeddings(config, dimensions, vs): process_all_videos(config) - global_metadata_file_path = os.path.join(config["meta_output_dir"], 'metadata.json') - print(f'global metadata file available at {global_metadata_file_path}') + global_metadata_file_path = os.path.join(config["meta_output_dir"], "metadata.json") + print(f"global metadata file available at {global_metadata_file_path}") store_into_vectordb(vs, global_metadata_file_path, dimensions) - -@register_microservice( - name="opea_service@prepare_videodoc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007 -) + +@register_microservice(name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) async def process_videos(files: List[UploadFile] = File(None)): """Ingest videos to VDMS.""" @@ -94,7 +91,7 @@ async def process_videos(files: List[UploadFile] = File(None)): emb_path = config["embeddings"]["path"] host = 
VECTORDB_SERVICE_HOST_IP port = int(VECTORDB_SERVICE_PORT) - selected_db = config['vector_db']['choice_of_db'] + selected_db = config["vector_db"]["choice_of_db"] vector_dimensions = config["embeddings"]["vector_dimensions"] print(f"Parsing videos {path}.") @@ -119,17 +116,26 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.') - print('Connecting to {} at {}:{}'.format(selected_db, host, port)) + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." + ) + print("Connecting to {} at {}:{}".format(selected_db, host, port)) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + vs = store_embeddings.VideoVS( + host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions + ) generate_embeddings(config, vector_dimensions, vs) - + + @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007, methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_videos", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -144,7 +150,11 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_vdms", endpoint="/v1/dataprep/get_file/{filename}", host="0.0.0.0", port=6007, methods=["GET"] + name="opea_service@prepare_videodoc_vdms", + endpoint="/v1/dataprep/get_file/{filename}", + host="0.0.0.0", + port=6007, + methods=["GET"], ) async def rag_get_file(filename: str): """Download the file from remote.""" diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py index 3ff5e21ee..ce364e669 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/store_embeddings.py @@ -94,8 +94,17 @@ def load_video_for_vclip(self, vid_path, num_frm=4, max_img_size=224, **kwargs): class VideoVS: - def __init__(self, host, port, selected_db, video_retriever_model, collection_name, embedding_dimensions:int = 512, chosen_video_search_type="similarity"): - + def __init__( + self, + host, + port, + selected_db, + video_retriever_model, + collection_name, + embedding_dimensions: int = 512, + chosen_video_search_type="similarity", + ): + self.host = host self.port = port self.selected_db = selected_db @@ -125,5 +134,5 @@ def init_db(self): collection_name=self.video_collection, engine="FaissFlat", distance_strategy="IP", - embedding_dimensions=self.embedding_dimensions + embedding_dimensions=self.embedding_dimensions, ) From b07036e32f5017b7954b9dbeabc822c48b3f80f9 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Fri, 6 Sep 2024 17:32:16 +0800 Subject: [PATCH 11/29] add wait after connect DB Signed-off-by: BaoHuiling --- .../multimodal_langchain/ingest_videos.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py 
b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 6092ca53f..ab7e45d0c 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -7,6 +7,7 @@ import uuid from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Type, Union +import time from fastapi import File, HTTPException, UploadFile from fastapi.responses import FileResponse @@ -49,9 +50,8 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data["timestamp"]], - clip_duration=[data["clip_duration"]], - embedding_dimensions=dimensions, + start_time=[data['timestamp']], + clip_duration=[data['clip_duration']] ) else: print(f"ERROR: selected_db {vs.selected_db} not supported. Supported:[vdms]") @@ -116,17 +116,15 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print( - "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time." - ) - print("Connecting to {} at {}:{}".format(selected_db, host, port)) + print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.', flush=True) + print('Connecting to {} at {}:{}'.format(selected_db, host, port), flush=True) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS( - host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions - ) - + vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + print("done creating DB, sleep 5s", flush=True) + time.sleep(5) + generate_embeddings(config, vector_dimensions, vs) From 0afc7b56b4c2b76c31f852f5cffb045f14ecffb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:34:38 +0000 Subject: [PATCH 12/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multimodal_langchain/ingest_videos.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index ab7e45d0c..bb8612c41 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -4,10 +4,10 @@ import json import os import shutil +import time import uuid from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Type, Union -import time from fastapi import File, HTTPException, UploadFile from fastapi.responses import FileResponse @@ -50,8 +50,8 @@ def store_into_vectordb(vs, metadata_file_path, dimensions): vs.video_db.add_videos( paths=video_name_list, metadatas=metadata_list, - start_time=[data['timestamp']], - clip_duration=[data['clip_duration']] + start_time=[data["timestamp"]], + clip_duration=[data["clip_duration"]], ) else: print(f"ERROR: selected_db {vs.selected_db} not supported. 
Supported:[vdms]") @@ -116,15 +116,20 @@ async def process_videos(files: List[UploadFile] = File(None)): shutil.copyfileobj(video_file.file, f) # Creating DB - print ('Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.', flush=True) - print('Connecting to {} at {}:{}'.format(selected_db, host, port), flush=True) + print( + "Creating DB with video embedding and metadata support, \nIt may take few minutes to download and load all required models if you are running for first time.", + flush=True, + ) + print("Connecting to {} at {}:{}".format(selected_db, host, port), flush=True) # init meanclip model model = setup_vclip_model(meanclip_cfg, device="cpu") - vs = store_embeddings.VideoVS(host, port, selected_db, model,collection_name, embedding_dimensions=vector_dimensions) + vs = store_embeddings.VideoVS( + host, port, selected_db, model, collection_name, embedding_dimensions=vector_dimensions + ) print("done creating DB, sleep 5s", flush=True) time.sleep(5) - + generate_embeddings(config, vector_dimensions, vs) From 9261a4ad00df571cc96b7b8e6500ba3058138985 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Fri, 6 Sep 2024 20:56:22 +0800 Subject: [PATCH 13/29] remove unused Signed-off-by: BaoHuiling --- comps/dataprep/vdms/multimodal_langchain/utils/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py index ba661cffd..3bb991395 100644 --- a/comps/dataprep/vdms/multimodal_langchain/utils/utils.py +++ b/comps/dataprep/vdms/multimodal_langchain/utils/utils.py @@ -50,7 +50,6 @@ def process_all_videos(config): meta_output_dir = config["meta_output_dir"] selected_db = config["vector_db"]["choice_of_db"] emb_path = config["embeddings"]["path"] - emb_type = config["embeddings"]["type"] chunk_duration = config["chunk_duration"] clip_duration = config["clip_duration"] From b06006a0e08d6e2c4f5f293020931d685be4f2bd Mon Sep 17 00:00:00 2001 From: Huiling Bao Date: Tue, 10 Sep 2024 13:35:20 +0800 Subject: [PATCH 14/29] Update comps/dataprep/vdms/README.md Co-authored-by: XinyuYe-Intel Signed-off-by: BaoHuiling --- comps/dataprep/vdms/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md index 617761f02..71c35df8a 100644 --- a/comps/dataprep/vdms/README.md +++ b/comps/dataprep/vdms/README.md @@ -141,7 +141,7 @@ You can specify chunk_size and chunk_size by the following commands. 
```bash curl -X POST \ -H "Content-Type: multipart/form-data" \ - -F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \ + -F "files=@./LLAMA2_page6.pdf" \ -F "chunk_size=1500" \ -F "chunk_overlap=100" \ http://localhost:6007/v1/dataprep From 56c578f9c2f254e7d1deb6444f994321c36508db Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 15:45:06 +0800 Subject: [PATCH 15/29] add test script for mm case Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100755 tests/test_dataprep_vdms_multimodal_langchain.sh diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh new file mode 100755 index 000000000..91dfb5f68 --- /dev/null +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/dataprep-vdms built fail" + exit 1 + else + echo "opea/dataprep-vdms built successful" + fi + docker pull intellabs/vdms:latest +} + +function start_service() { + VDMS_PORT=5043 + docker run -d --name="test-comps-dataprep-vdms" -p $VDMS_PORT:55555 intellabs/vdms:latest + dataprep_service_port=5013 + COLLECTION_NAME="test-comps" + docker run -d --name="test-comps-dataprep-vdms-server" -e COLLECTION_NAME=$COLLECTION_NAME -e no_proxy=$no_proxy -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VDMS_HOST=$ip_address -e VDMS_PORT=$VDMS_PORT -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-vdms:comps + sleep 30s +} + +function validate_microservice() { + cd $LOG_PATH + wget -q https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 + dataprep_service_port=5013 + + # test /v1/dataprep upload file + URL="http://$ip_address:$dataprep_service_port/v1/dataprep" + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} ) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-videos.log) + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." + else + echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi + else + echo "[ dataprep-upload-videos ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi + rm ./silence_girl.mp4 + + # test /v1/dataprep/get_videos + URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" + + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X GET ${URL} | tee ${LOG_PATH}/dataprep-get-videos.log) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ dataprep-get-videos ] Content is correct." + else + echo "[ dataprep-get-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log + exit 1 + fi + else + echo "[ dataprep-get-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log + exit 1 + fi + + # test /v1/dataprep/get_file/{filename} + file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) + echo $file_list + filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') + URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" + + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ download_file ]HTTP status is 200. Checking content..." + CONTENT=$(ls -l) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ download_file ] Content is correct." + else + echo "[ download_file ] Content is not correct. $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log + exit 1 + fi + else + echo "[ download_file ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From dc11dc2588e2a12782dfbc69241f3c02ac6b1ee2 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 15:45:31 +0800 Subject: [PATCH 16/29] add return value and update readme Signed-off-by: BaoHuiling --- comps/dataprep/vdms/multimodal_langchain/README.md | 10 +++++++++- .../vdms/multimodal_langchain/ingest_videos.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 54e878b65..96d719c9f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -112,5 +112,13 @@ curl -X POST \ - List of uploaded files ```bash -curl -X POST http://localhost:6007/v1/dataprep/get_videos +curl -X GET http://localhost:6007/v1/dataprep/get_videos +``` + +- Download uploaded files + +Please use the file name from the list + +```bash +curl -X GET http://localhost:6007/v1/dataprep/get_file/${filename} ``` diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index bb8612c41..91bd5d9be 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -131,6 +131,8 @@ async def process_videos(files: List[UploadFile] = File(None)): time.sleep(5) generate_embeddings(config, vector_dimensions, vs) + + return {"message": "Videos ingested successfully"} @register_microservice( From 04e12249734922a180f2d9c77f2516ab90b39b90 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:46:13 +0000 Subject: [PATCH 17/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/vdms/multimodal_langchain/ingest_videos.py | 2 +- tests/test_dataprep_vdms_multimodal_langchain.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py index 91bd5d9be..132913e3f 100644 --- a/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py +++ b/comps/dataprep/vdms/multimodal_langchain/ingest_videos.py @@ -131,7 +131,7 @@ async def process_videos(files: List[UploadFile] = File(None)): time.sleep(5) generate_embeddings(config, vector_dimensions, vs) - + return {"message": "Videos ingested successfully"} diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index 91dfb5f68..686269da2 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -60,7 +60,7 @@ function validate_microservice() { # test /v1/dataprep/get_videos URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" - + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." 
@@ -77,13 +77,13 @@ function validate_microservice() { docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log exit 1 fi - + # test /v1/dataprep/get_file/{filename} file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) echo $file_list filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" - + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ download_file ]HTTP status is 200. Checking content..." From ea465e4d933daa10ccc30abf1351e05c8e726ba0 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 17:05:14 +0800 Subject: [PATCH 18/29] check bug Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 42 +++++++------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index 686269da2..a079f8391 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -33,29 +33,23 @@ function start_service() { function validate_microservice() { cd $LOG_PATH - wget -q https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 - dataprep_service_port=5013 + wget https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 -O silence_girl.mp4 + ls && sleep 5 # test /v1/dataprep upload file URL="http://$ip_address:$dataprep_service_port/v1/dataprep" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} ) - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." - local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-videos.log) - if echo "$CONTENT" | grep "Videos ingested successfully"; then - echo "[ dataprep-upload-videos ] Content is correct." - else - echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log - docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log - exit 1 - fi + CONTENT=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) + echo "[ dataprep-upload-videos ] Checking content..." + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." else - echo "[ dataprep-upload-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + echo "[ dataprep-upload-videos ] Content is not correct. 
Received content was $CONTENT" docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log exit 1 fi + + sleep 1s rm ./silence_girl.mp4 # test /v1/dataprep/get_videos @@ -84,22 +78,16 @@ function validate_microservice() { filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ download_file ]HTTP status is 200. Checking content..." - CONTENT=$(ls -l) - if echo "$CONTENT" | grep "silence_girl"; then - echo "[ download_file ] Content is correct." - else - echo "[ download_file ] Content is not correct. $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log - exit 1 - fi + http_proxy="" wget ${URL} + CONTENT=$(ls) + if echo "$CONTENT" | grep "silence_girl"; then + echo "[ download_file ] Content is correct." else - echo "[ download_file ] HTTP status is not 200. Received status was $HTTP_STATUS" + echo "[ download_file ] Content is not correct. $CONTENT" docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/download_file.log exit 1 fi + } function stop_docker() { From acc7a05d1655fff4ec68c53c1943c273cf7f8ea6 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:01:34 +0800 Subject: [PATCH 19/29] fix mm-script Signed-off-by: BaoHuiling --- ...test_dataprep_vdms_multimodal_langchain.sh | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/test_dataprep_vdms_multimodal_langchain.sh index a079f8391..e53e528b5 100755 --- a/tests/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/test_dataprep_vdms_multimodal_langchain.sh @@ -34,17 +34,28 @@ function start_service() { function validate_microservice() { cd $LOG_PATH wget https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4 -O silence_girl.mp4 - ls && sleep 5 + sleep 5 # test /v1/dataprep upload file URL="http://$ip_address:$dataprep_service_port/v1/dataprep" - CONTENT=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) - echo "[ dataprep-upload-videos ] Checking content..." - if echo "$CONTENT" | grep "Videos ingested successfully"; then - echo "[ dataprep-upload-videos ] Content is correct." + + response=$(http_proxy="" curl -s -w "\n%{http_code}" -X POST -F 'files=@./silence_girl.mp4' -H 'Content-Type: multipart/form-data' ${URL}) + CONTENT=$(echo "$response" | sed -e '$ d') + HTTP_STATUS=$(echo "$response" | tail -n 1) + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-videos ] HTTP status is 200. Checking content..." + if echo "$CONTENT" | grep "Videos ingested successfully"; then + echo "[ dataprep-upload-videos ] Content is correct." + else + echo "[ dataprep-upload-videos ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log + exit 1 + fi else - echo "[ dataprep-upload-videos ] Content is not correct. 
Received content was $CONTENT" - docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-videos.log + echo "[ dataprep-upload-videos ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-get-videos.log docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-videos_vdms.log exit 1 fi @@ -55,10 +66,12 @@ function validate_microservice() { # test /v1/dataprep/get_videos URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos" - HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X GET ${URL}) + response=$(http_proxy="" curl -s -w "\n%{http_code}" -X GET ${URL}) + CONTENT=$(echo "$response" | sed -e '$ d') + HTTP_STATUS=$(echo "$response" | tail -n 1) + if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep-get-videos ]HTTP status is 200. Checking content..." - local CONTENT=$(http_proxy="" curl -s -X GET ${URL} | tee ${LOG_PATH}/dataprep-get-videos.log) + echo "[ dataprep-get-videos ] HTTP status is 200. Checking content..." if echo "$CONTENT" | grep "silence_girl"; then echo "[ dataprep-get-videos ] Content is correct." else @@ -73,8 +86,7 @@ function validate_microservice() { fi # test /v1/dataprep/get_file/{filename} - file_list=$(http_proxy="" curl -s -X GET http://$ip_address:$dataprep_service_port/v1/dataprep/get_videos) - echo $file_list + file_list=$CONTENT filename=$(echo $file_list | sed 's/^\[//;s/\]$//;s/,.*//;s/"//g') URL="http://$ip_address:$dataprep_service_port/v1/dataprep/get_file/${filename}" From a66da363a4ad869bf9f5fafc405c7cb3554bb091 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:44:37 +0800 Subject: [PATCH 20/29] add into dataprep workflow Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index e17783051..83c053dd6 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -23,3 +23,6 @@ services: build: dockerfile: comps/dataprep/pinecone/docker/Dockerfile image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} + dataprep-vdms: + build: + dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile From 2699710e3d61a033fb5af54d111c1eb13eac3291 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Tue, 10 Sep 2024 19:48:08 +0800 Subject: [PATCH 21/29] rm whitespace Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index 83c053dd6..384f0ceca 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -25,4 +25,4 @@ services: image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} dataprep-vdms: build: - dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile + dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile From ebe5a91927f3ace896b2740d3aeceab78f0a2bf8 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 16:16:29 -0700 Subject: [PATCH 22/29] updated readme and added test script Signed-off-by: srinarayan-srikanthan --- tests/dataprep-upload-file.log | 1 + 
tests/test_dataprep_vdms_langchain.sh | 83 +++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/dataprep-upload-file.log create mode 100644 tests/test_dataprep_vdms_langchain.sh diff --git a/tests/dataprep-upload-file.log b/tests/dataprep-upload-file.log new file mode 100644 index 000000000..d0c4bf41f --- /dev/null +++ b/tests/dataprep-upload-file.log @@ -0,0 +1 @@ +{"status":200,"message":"Data preparation succeeded"} \ No newline at end of file diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/test_dataprep_vdms_langchain.sh new file mode 100644 index 000000000..817d0a89d --- /dev/null +++ b/tests/test_dataprep_vdms_langchain.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/dataprep-vdms built fail" + exit 1 + else + echo "opea/dataprep-vdms built successful" + fi + docker pull intellabs/vdms:latest +} + +function start_service() { + VDMS_PORT=5043 + docker run -d --name="test-comps-dataprep-vdms" -p $VDMS_PORT:55555 intellabs/vdms:latest + dataprep_service_port=5013 + COLLECTION_NAME="test-comps" + docker run -d --name="test-comps-dataprep-vdms-server" -e COLLECTION_NAME=$COLLECTION_NAME -e no_proxy=$no_proxy -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VDMS_HOST=$ip_address -e VDMS_PORT=$VDMS_PORT -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-vdms:comps + sleep 30s +} + +function validate_microservice() { + cd $LOG_PATH + + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + + dataprep_service_port=5013 + + URL="http://$ip_address:$dataprep_service_port/v1/dataprep" + HTTP_STATUS=$(http_proxy="" curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' ${URL} ) + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep-upload-file ] HTTP status is 200. Checking content..." + local CONTENT=$(http_proxy="" curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' ${URL} | tee ${LOG_PATH}/dataprep-upload-file.log) + if echo "$CONTENT" | grep "Data preparation succeeded"; then + echo "[ dataprep-upload-file ] Content is correct." + else + echo "[ dataprep-upload-file ] Content is not correct. Received content was $CONTENT" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-file.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-file_vdms.log + exit 1 + fi + else + echo "[ dataprep-upload-file ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-server >> ${LOG_PATH}/dataprep-upload-file.log + docker logs test-comps-dataprep-vdms >> ${LOG_PATH}/dataprep-upload-file_vdms.log + exit 1 + fi + rm ./dataprep_file.txt + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 2b6f6d5557085efa4a38b8fef45ca169befe4781 Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 16:17:11 -0700 Subject: [PATCH 23/29] removed unused file Signed-off-by: srinarayan-srikanthan --- tests/dataprep-upload-file.log | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tests/dataprep-upload-file.log diff --git a/tests/dataprep-upload-file.log b/tests/dataprep-upload-file.log deleted file mode 100644 index d0c4bf41f..000000000 --- a/tests/dataprep-upload-file.log +++ /dev/null @@ -1 +0,0 @@ -{"status":200,"message":"Data preparation succeeded"} \ No newline at end of file From 808f1f7bba270c9fd545b85805643cbf3e02d920 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:20:54 +0000 Subject: [PATCH 24/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_dataprep_vdms_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/test_dataprep_vdms_langchain.sh index 817d0a89d..7a7d14fe4 100644 --- a/tests/test_dataprep_vdms_langchain.sh +++ b/tests/test_dataprep_vdms_langchain.sh @@ -33,7 +33,7 @@ function start_service() { function validate_microservice() { cd $LOG_PATH - + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." 
> $LOG_PATH/dataprep_file.txt dataprep_service_port=5013 From 9fe2571b96fbc40f73af855a72d313481455ca04 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Wed, 11 Sep 2024 07:39:04 +0800 Subject: [PATCH 25/29] move test script Signed-off-by: BaoHuiling --- tests/{ => dataprep}/test_dataprep_vdms_langchain.sh | 0 tests/{ => dataprep}/test_dataprep_vdms_multimodal_langchain.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{ => dataprep}/test_dataprep_vdms_langchain.sh (100%) rename tests/{ => dataprep}/test_dataprep_vdms_multimodal_langchain.sh (100%) diff --git a/tests/test_dataprep_vdms_langchain.sh b/tests/dataprep/test_dataprep_vdms_langchain.sh similarity index 100% rename from tests/test_dataprep_vdms_langchain.sh rename to tests/dataprep/test_dataprep_vdms_langchain.sh diff --git a/tests/test_dataprep_vdms_multimodal_langchain.sh b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh similarity index 100% rename from tests/test_dataprep_vdms_multimodal_langchain.sh rename to tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh From ebe7c7d8c2592443d12e31955015e25624aeb51b Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 18:37:36 -0700 Subject: [PATCH 26/29] restructured repo Signed-off-by: srinarayan-srikanthan --- comps/dataprep/vdms/README.md | 16 +-- .../vdms/langchain/{docker => }/Dockerfile | 4 + .../docker-compose-dataprep-vdms.yaml | 0 .../vdms/langchain/prepare_doc_vdms.py | 108 +++++++++++++++--- .../{docker => }/Dockerfile | 0 .../vdms/multimodal_langchain/README.md | 4 +- .../docker-compose-dataprep-vdms.yaml | 0 7 files changed, 106 insertions(+), 26 deletions(-) rename comps/dataprep/vdms/langchain/{docker => }/Dockerfile (86%) rename comps/dataprep/vdms/langchain/{docker => }/docker-compose-dataprep-vdms.yaml (100%) rename comps/dataprep/vdms/multimodal_langchain/{docker => }/Dockerfile (100%) rename comps/dataprep/vdms/multimodal_langchain/{docker => }/docker-compose-dataprep-vdms.yaml (100%) diff --git a/comps/dataprep/vdms/README.md b/comps/dataprep/vdms/README.md index 71c35df8a..2a0d2ca45 100644 --- a/comps/dataprep/vdms/README.md +++ b/comps/dataprep/vdms/README.md @@ -10,7 +10,7 @@ We organized the folders in the same way, so you can use either framework for da ## 1.1 Install Requirements -- option 1: Install Single-process version (for 1-10 files processing) +Install Single-process version (for 1-10 files processing) ```bash apt-get update @@ -46,7 +46,7 @@ export PYTHONPATH=${path_to_comps} Start document preparation microservice for VDMS with below command. -- option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash python prepare_doc_vdms.py @@ -82,22 +82,22 @@ export PYTHONPATH=${path_to_comps} - Build docker image with langchain -* option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash cd ../../../ -docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . +docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/Dockerfile . ``` +docker build -t opea/dataprep-on-ray-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain_ray/Dockerfile . 
--> ## 2.4 Run Docker with CLI -- option 1: Start single-process version (for 1-10 files processing) +Start single-process version (for 1-10 files processing) ```bash docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \ @@ -127,7 +127,7 @@ Once document preparation microservice for VDMS is started, user can use below c Make sure the file path after `files=@` is correct. - +``` diff --git a/comps/dataprep/vdms/langchain/docker/Dockerfile b/comps/dataprep/vdms/langchain/Dockerfile similarity index 86% rename from comps/dataprep/vdms/langchain/docker/Dockerfile rename to comps/dataprep/vdms/langchain/Dockerfile index 606b0a4e1..df5b75544 100644 --- a/comps/dataprep/vdms/langchain/docker/Dockerfile +++ b/comps/dataprep/vdms/langchain/Dockerfile @@ -28,6 +28,10 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ ENV PYTHONPATH=/home/user +USER root + +RUN mkdir -p /home/user/comps/dataprep/vdms/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/langchain + USER user WORKDIR /home/user/comps/dataprep/vdms/langchain diff --git a/comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/langchain/docker-compose-dataprep-vdms.yaml similarity index 100% rename from comps/dataprep/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml rename to comps/dataprep/vdms/langchain/docker-compose-dataprep-vdms.yaml diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index e6f7d0072..8bfc309bb 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -2,30 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 import os - +import json from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client from langchain_text_splitters import HTMLHeaderTextSplitter - -from comps import DocPath, opea_microservices, opea_telemetry, register_microservice -from comps.dataprep.utils import document_loader, get_separators, get_tables_result +from fastapi import Body, File, Form, HTTPException, UploadFile +from typing import List, Optional, Union +from comps import CustomLogger,DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import document_loader, get_separators, get_tables_result, encode_filename, save_content_to_local_disk, parse_html, create_upload_folder tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) +logger = CustomLogger("prepare_doc_redis") +logflag = os.getenv("LOGFLAG", False) +upload_folder = "./uploaded_files/" - -@register_microservice( - name="opea_service@prepare_doc_vdms", - endpoint="/v1/dataprep", - host="0.0.0.0", - port=6007, - input_datatype=DocPath, - output_datatype=None, -) -@opea_telemetry -def ingest_documents(doc_path: DocPath): +def ingest_data_to_vdms(doc_path: DocPath): """Ingest document to VDMS.""" path = doc_path.path print(f"Parsing document {doc_path}.") @@ -42,7 +36,7 @@ def ingest_documents(doc_path: DocPath): chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() ) - content = document_loader(doc_path) + content = document_loader(path) chunks = 
text_splitter.split_text(content) if doc_path.process_table and path.endswith(".pdf"): table_chunks = get_tables_result(path, doc_path.table_strategy) @@ -76,5 +70,87 @@ def ingest_documents(doc_path: DocPath): print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + logger.info(f"[ upload ] link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + + for file in files: + encode_file = encode_filename(file.filename) + doc_id = "file:" + encode_file + if logflag: + logger.info(f"[ upload ] processing file {doc_id}") + + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_vdms( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"[ upload ] Successfully saved file {save_path}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail=f"Link_list {link_list} should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + doc_id = "file:" + encoded_link + ".txt" + if logflag: + logger.info(f"[ upload ] processing link {doc_id}") + + # check whether the link file already exists + + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) + ingest_data_to_vdms( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + + if __name__ == "__main__": + create_upload_folder(upload_folder) opea_microservices["opea_service@prepare_doc_vdms"].start() diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile b/comps/dataprep/vdms/multimodal_langchain/Dockerfile similarity index 100% rename from comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile rename to comps/dataprep/vdms/multimodal_langchain/Dockerfile diff --git a/comps/dataprep/vdms/multimodal_langchain/README.md b/comps/dataprep/vdms/multimodal_langchain/README.md index 96d719c9f..0b5b721fa 100644 --- a/comps/dataprep/vdms/multimodal_langchain/README.md +++ b/comps/dataprep/vdms/multimodal_langchain/README.md @@ -67,14 +67,14 @@ export your_hf_api_token="{your_hf_token}" ```bash cd ../../../ - docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . 
+ docker build -t opea/dataprep-vdms:latest --network host --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/Dockerfile . ``` ## 2.4 Run Docker Compose ```bash -docker compose -f comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml up -d +docker compose -f comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml up -d ``` # 🚀3. Status Microservice diff --git a/comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml similarity index 100% rename from comps/dataprep/vdms/multimodal_langchain/docker/docker-compose-dataprep-vdms.yaml rename to comps/dataprep/vdms/multimodal_langchain/docker-compose-dataprep-vdms.yaml From cb2c033c5a4379347d8900e49df2552b717e9ea3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:41:27 +0000 Subject: [PATCH 27/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vdms/langchain/prepare_doc_vdms.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index 8bfc309bb..c89c7517f 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -1,17 +1,27 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os import json +import os +from typing import List, Optional, Union + from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT +from fastapi import Body, File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client from langchain_text_splitters import HTMLHeaderTextSplitter -from fastapi import Body, File, Form, HTTPException, UploadFile -from typing import List, Optional, Union -from comps import CustomLogger,DocPath, opea_microservices, register_microservice -from comps.dataprep.utils import document_loader, get_separators, get_tables_result, encode_filename, save_content_to_local_disk, parse_html, create_upload_folder + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + create_upload_folder, + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) @@ -19,6 +29,7 @@ logflag = os.getenv("LOGFLAG", False) upload_folder = "./uploaded_files/" + def ingest_data_to_vdms(doc_path: DocPath): """Ingest document to VDMS.""" path = doc_path.path @@ -130,7 +141,7 @@ async def ingest_documents( logger.info(f"[ upload ] processing link {doc_id}") # check whether the link file already exists - + save_path = upload_folder + encoded_link + ".txt" content = parse_html([link])[0][0] await save_content_to_local_disk(save_path, content) @@ -150,7 +161,6 @@ async def ingest_documents( raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") - if __name__ == "__main__": 
create_upload_folder(upload_folder) opea_microservices["opea_service@prepare_doc_vdms"].start() From a8d2657498a0b41ec645b303eec619f743566bca Mon Sep 17 00:00:00 2001 From: srinarayan-srikanthan Date: Tue, 10 Sep 2024 18:53:14 -0700 Subject: [PATCH 28/29] updates path in test script Signed-off-by: srinarayan-srikanthan --- tests/dataprep/test_dataprep_vdms_langchain.sh | 2 +- tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataprep/test_dataprep_vdms_langchain.sh b/tests/dataprep/test_dataprep_vdms_langchain.sh index 7a7d14fe4..4fe0d0f0a 100644 --- a/tests/dataprep/test_dataprep_vdms_langchain.sh +++ b/tests/dataprep/test_dataprep_vdms_langchain.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/Dockerfile . if [ $? -ne 0 ]; then echo "opea/dataprep-vdms built fail" diff --git a/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh index e53e528b5..3dc70a7a3 100755 --- a/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh +++ b/tests/dataprep/test_dataprep_vdms_multimodal_langchain.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-vdms:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/multimodal_langchain/Dockerfile . if [ $? -ne 0 ]; then echo "opea/dataprep-vdms built fail" From 1fbc34314f98e4646607707cf228524e35824181 Mon Sep 17 00:00:00 2001 From: BaoHuiling Date: Wed, 11 Sep 2024 12:24:14 +0800 Subject: [PATCH 29/29] add name for build Signed-off-by: BaoHuiling --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index bd292fc47..80f46ab7e 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -26,3 +26,4 @@ services: dataprep-vdms: build: dockerfile: comps/dataprep/vdms/multimodal_langchain/docker/Dockerfile + image: ${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest}